File size: 3,716 Bytes
15de73a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
"""Smoke-test sentence-transformers in isolation to localize a segfault.

Runs seven increasingly demanding scenarios (import, model load, single
encode, small batch, explicit-parameter encode, 100-text batch, long
biography text). Exits with status 1 at the first failing step so the
faulting stage is unambiguous.
"""

import os
import sys
import traceback


def _run_test(label, fn, *, show_traceback=True):
    """Print *label*, run *fn* (a zero-arg callable), and report the result.

    On success, prints ``  ✓ <message>`` where *message* is *fn*'s return
    value. On any exception, prints the failure, optionally a traceback,
    and terminates the process with exit status 1.
    """
    print(label)
    try:
        message = fn()
    except Exception as e:
        print(f"  ❌ Failed: {e}")
        if show_traceback:
            traceback.print_exc()
        sys.exit(1)
    print(f"  ✓ {message}")


def main():
    print("=" * 60)
    print("SENTENCE TRANSFORMERS TEST")
    print("=" * 60)
    print(f"Python version: {sys.version}")
    print()

    # Shared state threaded between test steps (class, then loaded model).
    state = {}

    # Test 1: import. Deliberately no traceback on failure — an ImportError
    # message is self-explanatory.
    def import_library():
        from sentence_transformers import SentenceTransformer
        state["cls"] = SentenceTransformer
        return "sentence_transformers imported"

    _run_test("Test 1: Import sentence_transformers...", import_library,
              show_traceback=False)

    # Test 2: model load (downloads weights on first run).
    def load_model():
        # Tokenizer worker threads are a known crash suspect; force them off
        # before the model (and its tokenizer) is constructed.
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
        state["model"] = state["cls"]('all-MiniLM-L6-v2')
        return "Model loaded"

    _run_test("\nTest 2: Load model (this downloads ~90MB on first run)...",
              load_model)

    model = state["model"]
    texts = ["First sentence", "Second sentence", "Third sentence"]

    # Test 3: minimal single-sentence encode.
    def encode_single():
        embedding = model.encode(["This is a test sentence."])
        return f"Encoded text, embedding shape: {embedding.shape}"

    _run_test("\nTest 3: Encode simple text...", encode_single)

    # Test 4: small batch with default parameters.
    def encode_batch():
        embeddings = model.encode(texts, show_progress_bar=False)
        return f"Encoded {len(texts)} texts, shape: {embeddings.shape}"

    _run_test("\nTest 4: Encode batch of texts...", encode_batch)

    # Test 5: same batch, but with every parameter the production script
    # passes spelled out — isolates a bad kwarg combination.
    def encode_explicit():
        embeddings = model.encode(
            texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device='cpu',
        )
        return f"Encoded with explicit params, shape: {embeddings.shape}"

    _run_test(
        "\nTest 5: Encode with explicit parameters (like in our script)...",
        encode_explicit,
    )

    # Test 6: a larger (100-text) batch to probe batch-size sensitivity.
    def encode_large_batch():
        large_texts = [f"This is test sentence number {i}" for i in range(100)]
        embeddings = model.encode(
            large_texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device='cpu',
        )
        return f"Encoded {len(large_texts)} texts, shape: {embeddings.shape}"

    _run_test("\nTest 6: Encode larger batch (100 texts)...", encode_large_batch)

    # Test 7: realistic long, multi-line input resembling production data.
    def encode_biography():
        bio = """A Representative from Illinois and 16th President of the United States;

    born in Hardin County, Ky., February 12, 1809; moved with his parents to a tract

    on Little Pigeon Creek, Ind., in 1816; attended a log-cabin school at short intervals

    and was self-instructed in elementary branches."""

        embedding = model.encode([bio], show_progress_bar=False, device='cpu')
        return f"Encoded biography, shape: {embedding.shape}"

    _run_test("\nTest 7: Encode biography-like text...", encode_biography)

    print("\n" + "=" * 60)
    print("✅ ALL TESTS PASSED!")
    print("=" * 60)
    print("\nSentence transformers is working correctly.")
    print("The issue may be with the combination of:")
    print("  - Very large batch processing")
    print("  - Integration with FAISS normalize")
    print("  - Memory management with 13k+ texts")


if __name__ == "__main__":
    main()