#!/usr/bin/env python3
"""
Test sentence-transformers to isolate the segfault.
"""
import sys
import os

# Banner: identify the test run and report the interpreter version,
# since segfaults are often Python-version / wheel-ABI specific.
_RULE = "=" * 60
print(_RULE)
print("SENTENCE TRANSFORMERS TEST")
print(_RULE)
print(f"Python version: {sys.version}")
print()
# Test 1: import sentence_transformers on its own.
# The import alone pulls in native extensions (torch, tokenizers), which
# can crash before any model code runs, so it gets a dedicated step.
print("Test 1: Import sentence_transformers...")
try:
    from sentence_transformers import SentenceTransformer
except Exception as e:
    print(f" ✗ Failed: {e}")
    sys.exit(1)
else:
    print(" ✓ sentence_transformers imported")
# Test 2: instantiate the model (downloads ~90MB on the first run).
print("\nTest 2: Load model (this downloads ~90MB on first run)...")
try:
    # Disable the Rust tokenizers' internal thread pool — presumably to
    # avoid the fork/threading interactions it is known to warn about.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print(" ✓ Model loaded")
except Exception as e:
    print(f" ✗ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Test 3: the smallest possible encode call — a single short sentence.
print("\nTest 3: Encode simple text...")
try:
    text = "This is a test sentence."
    embedding = model.encode([text])
    print(f" ✓ Encoded text, embedding shape: {embedding.shape}")
except Exception as e:
    print(f" ✗ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Test 4: a small batch, to check the batched code path separately from
# the single-sentence one.
print("\nTest 4: Encode batch of texts...")
try:
    texts = ["First sentence", "Second sentence", "Third sentence"]
    embeddings = model.encode(texts, show_progress_bar=False)
    print(f" ✓ Encoded {len(texts)} texts, shape: {embeddings.shape}")
except Exception as e:
    print(f" ✗ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Test 5: encode with the exact keyword arguments the failing production
# script uses, to rule out a parameter-specific crash path.
print("\nTest 5: Encode with explicit parameters (like in our script)...")
try:
    embeddings = model.encode(
        texts,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu',
    )
    print(f" ✓ Encoded with explicit params, shape: {embeddings.shape}")
except Exception as e:
    print(f" ✗ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Test 6: a larger batch (100 texts) to probe for size-dependent crashes
# before scaling up to the full 13k+ corpus.
print("\nTest 6: Encode larger batch (100 texts)...")
try:
    large_texts = [f"This is test sentence number {i}" for i in range(100)]
    embeddings = model.encode(
        large_texts,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
        device='cpu',
    )
    print(f" ✓ Encoded {len(large_texts)} texts, shape: {embeddings.shape}")
except Exception as e:
    print(f" ✗ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Test 7: realistic input — a multi-line, biography-style paragraph like
# the real corpus, in case the crash is triggered by longer text.
print("\nTest 7: Encode biography-like text...")
try:
    bio = """A Representative from Illinois and 16th President of the United States;
born in Hardin County, Ky., February 12, 1809; moved with his parents to a tract
on Little Pigeon Creek, Ind., in 1816; attended a log-cabin school at short intervals
and was self-instructed in elementary branches."""
    embedding = model.encode([bio], show_progress_bar=False, device='cpu')
    print(f" ✓ Encoded biography, shape: {embedding.shape}")
except Exception as e:
    print(f" ✗ Failed: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# All tests passed: summarize and point at the remaining suspects that
# this script could NOT reproduce in isolation.
print("\n" + "=" * 60)
print("✅ ALL TESTS PASSED!")
print("=" * 60)
print("\nSentence transformers is working correctly.")
print("The issue may be with the combination of:")
print(" - Very large batch processing")
print(" - Integration with FAISS normalize")
print(" - Memory management with 13k+ texts")