File size: 3,716 Bytes
15de73a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
"""Smoke-test sentence-transformers in isolation to localize a segfault.

Runs seven increasingly demanding scenarios (import, model load, single
encode, small batch, explicit-parameter encode, 100-text batch, long
biography text). Exits with status 1 at the first failing step so the
faulting stage is unambiguous.
"""

import os
import sys
import traceback


def _run_test(label, fn, *, show_traceback=True):
    """Print *label*, run *fn* (a zero-arg callable), and report the result.

    On success, prints ``  ✓ <message>`` where *message* is *fn*'s return
    value. On any exception, prints the failure, optionally a traceback,
    and terminates the process with exit status 1.
    """
    print(label)
    try:
        message = fn()
    except Exception as e:
        print(f"  ❌ Failed: {e}")
        if show_traceback:
            traceback.print_exc()
        sys.exit(1)
    print(f"  ✓ {message}")


def main():
    print("=" * 60)
    print("SENTENCE TRANSFORMERS TEST")
    print("=" * 60)
    print(f"Python version: {sys.version}")
    print()

    # Shared state threaded between test steps (class, then loaded model).
    state = {}

    # Test 1: import. Deliberately no traceback on failure — an ImportError
    # message is self-explanatory.
    def import_library():
        from sentence_transformers import SentenceTransformer
        state["cls"] = SentenceTransformer
        return "sentence_transformers imported"

    _run_test("Test 1: Import sentence_transformers...", import_library,
              show_traceback=False)

    # Test 2: model load (downloads weights on first run).
    def load_model():
        # Tokenizer worker threads are a known crash suspect; force them off
        # before the model (and its tokenizer) is constructed.
        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
        state["model"] = state["cls"]('all-MiniLM-L6-v2')
        return "Model loaded"

    _run_test("\nTest 2: Load model (this downloads ~90MB on first run)...",
              load_model)

    model = state["model"]
    texts = ["First sentence", "Second sentence", "Third sentence"]

    # Test 3: minimal single-sentence encode.
    def encode_single():
        embedding = model.encode(["This is a test sentence."])
        return f"Encoded text, embedding shape: {embedding.shape}"

    _run_test("\nTest 3: Encode simple text...", encode_single)

    # Test 4: small batch with default parameters.
    def encode_batch():
        embeddings = model.encode(texts, show_progress_bar=False)
        return f"Encoded {len(texts)} texts, shape: {embeddings.shape}"

    _run_test("\nTest 4: Encode batch of texts...", encode_batch)

    # Test 5: same batch, but with every parameter the production script
    # passes spelled out — isolates a bad kwarg combination.
    def encode_explicit():
        embeddings = model.encode(
            texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device='cpu',
        )
        return f"Encoded with explicit params, shape: {embeddings.shape}"

    _run_test(
        "\nTest 5: Encode with explicit parameters (like in our script)...",
        encode_explicit,
    )

    # Test 6: a larger (100-text) batch to probe batch-size sensitivity.
    def encode_large_batch():
        large_texts = [f"This is test sentence number {i}" for i in range(100)]
        embeddings = model.encode(
            large_texts,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
            device='cpu',
        )
        return f"Encoded {len(large_texts)} texts, shape: {embeddings.shape}"

    _run_test("\nTest 6: Encode larger batch (100 texts)...", encode_large_batch)

    # Test 7: realistic long, multi-line input resembling production data.
    def encode_biography():
        bio = """A Representative from Illinois and 16th President of the United States;

    born in Hardin County, Ky., February 12, 1809; moved with his parents to a tract

    on Little Pigeon Creek, Ind., in 1816; attended a log-cabin school at short intervals

    and was self-instructed in elementary branches."""

        embedding = model.encode([bio], show_progress_bar=False, device='cpu')
        return f"Encoded biography, shape: {embedding.shape}"

    _run_test("\nTest 7: Encode biography-like text...", encode_biography)

    print("\n" + "=" * 60)
    print("✅ ALL TESTS PASSED!")
    print("=" * 60)
    print("\nSentence transformers is working correctly.")
    print("The issue may be with the combination of:")
    print("  - Very large batch processing")
    print("  - Integration with FAISS normalize")
    print("  - Memory management with 13k+ texts")


if __name__ == "__main__":
    main()