Moreover does this issue affect option a pregenerated embedd typesense #community-help

Moreover, does this issue affect option a pregener...
Marius Wilsch
08/28/2025, 9:54 AM
Moreover, does this issue affect option (a), pre-generated embeddings, as well? https://github.com/typesense/typesense/issues/2060
Kishore Nallan
08/28/2025, 10:07 AM
We disable the prefix option when remote embedding is used because typically type-ahead use cases require low latency and remote api calls are a huge foot gun because they will be so slow.
Marius Wilsch
08/28/2025, 10:08 AM
But with pre-generated embeddings that isn't an issue?
Kishore Nallan
08/28/2025, 10:10 AM
I'm not sure about that. Please try it out and let me know, perhaps we can enable it.
Marius Wilsch
08/28/2025, 10:10 AM
So far my tests have shown that it seems to work. I received different results with hybrid search vs. pure semantic or keyword search.
Marius Wilsch
08/28/2025, 10:11 AM
The following test script was used
Copy code
#!/usr/bin/env python3
"""
Test Rank Fusion (RRF) in Hybrid Search
=======================================

Tests whether hybrid search actually combines keyword + vector results
or if one dominates the other.

Expected: Different rankings between pure keyword, pure vector, and hybrid search
"""

import json
import requests
import typesense
import sys
from typing import Dict, List, Any

# Configuration
TYPESENSE_HOST = "localhost"
TYPESENSE_PORT = "8108"
TYPESENSE_API_KEY = "xyz123"
# Embeddings endpoint that get_embedding() POSTs to. This must be a bare URL:
# the chat paste had wrapped it in <...> link markup, which is not a valid URL.
TEI_URL = "http://localhost:8080/v1/embeddings"
TEST_COLLECTION = "rrf_test"

def setup_client():
    """Build and return a Typesense client for the single configured node.

    Host, port and API key come from the module-level configuration
    constants; the node is reached over plain HTTP with a 10 second
    connection timeout.
    """
    node = {
        'host': TYPESENSE_HOST,
        'port': TYPESENSE_PORT,
        'protocol': 'http',
    }
    settings = {
        'api_key': TYPESENSE_API_KEY,
        'nodes': [node],
        'connection_timeout_seconds': 10,
    }
    return typesense.Client(settings)

def cleanup_collection(client):
    """Delete the test collection, tolerating its absence.

    This is a best-effort helper: it never raises for ordinary errors. A
    missing collection is reported as such, and any other failure (for
    example an unreachable server) is reported as a failure instead of being
    mislabelled as "doesn't exist".

    Args:
        client: Typesense client whose ``collections`` accessor is used.
    """
    # Function-scope import so the block does not depend on how the package
    # exposes its ``exceptions`` submodule at the top level.
    from typesense.exceptions import ObjectNotFound

    try:
        client.collections[TEST_COLLECTION].delete()
    except ObjectNotFound:
        print(f"✓ Collection '{TEST_COLLECTION}' doesn't exist")
    except Exception as e:
        # Still swallow the error (callers treat cleanup as optional), but
        # say what actually went wrong. Unlike the former bare ``except:``
        # this no longer traps KeyboardInterrupt / SystemExit.
        print(f"✗ Failed to clean up collection '{TEST_COLLECTION}': {e}")
    else:
        print(f"✓ Cleaned up existing collection '{TEST_COLLECTION}'")

def create_test_collection(client):
    """Create the test collection with a manually supplied vector field.

    The schema has string ``id`` and ``content`` fields plus a ``float[]``
    field named ``embedding`` declared with 2560 dimensions.

    Args:
        client: Typesense client used to create the collection.

    Returns:
        True when creation succeeded, False when it raised (the error is
        printed rather than propagated).
    """
    fields = [
        {"name": "id", "type": "string"},
        {"name": "content", "type": "string"},
        {"name": "embedding", "type": "float[]", "num_dim": 2560},
    ]
    collection_schema = {"name": TEST_COLLECTION, "fields": fields}

    try:
        client.collections.create(collection_schema)
    except Exception as err:
        print(f"✗ Failed to create collection: {err}")
        return False
    else:
        print(f"✓ Created collection '{TEST_COLLECTION}'")
        return True

def get_embedding(text: str) -> List[float]:
    """Generate an embedding for *text* via the endpoint at ``TEI_URL``.

    Sends a JSON POST with the text as ``input`` and returns the
    ``embedding`` of the first entry in the response's ``data`` list.

    Args:
        text: The text to embed.

    Returns:
        The embedding vector, or an empty list when the request or the
        response parsing fails (the error is printed). Callers rely on the
        empty list as the failure signal, so this function never raises for
        ordinary errors.
    """
    try:
        # The chat paste had mangled this call into link markup
        # ("<http://requests.post|requests.post>"), which is not valid
        # Python; this is the intended plain ``requests.post`` call.
        response = requests.post(
            TEI_URL,
            headers={"Content-Type": "application/json"},
            json={"input": text, "model": "text-embeddings-inference"},
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()
        return data['data'][0]['embedding']
    except Exception as e:
        # Deliberately broad: network errors, HTTP errors, invalid JSON and
        # unexpected response shapes all map to the same "no embedding" result.
        print(f"✗ Failed to generate embedding for '{text}': {e}")
        return []

def setup_test_documents(client):
    """Embed and index a small German corpus built to separate the rankings.

    Each document is embedded with ``get_embedding``; documents for which no
    embedding comes back are skipped. The remaining documents are then
    indexed one at a time into the test collection.

    Args:
        client: Typesense client used for indexing.

    Returns:
        The number of documents that were indexed.
    """
    corpus = [
        # Should rank HIGH in keyword search for "Schaden"
        {"id": "1", "content": "Schaden Schaden Schaden - multiple keyword matches here"},

        # Semantically related, but no keyword match
        {"id": "2", "content": "Entschädigung und Haftungsansprüche bei Vertragsverletzung"},

        # Different semantic meaning
        {"id": "3", "content": "Datenschutz Verordnung bezüglich personenbezogener Daten"},

        # Mixed content - some keyword, some semantic
        {"id": "4", "content": "Schadensersatz Forderungen und finanzielle Kompensation"},

        # Unrelated content
        {"id": "5", "content": "Urteil des Landgerichts über Mietverträge"}
    ]

    # First pass: attach an embedding to every document we can embed.
    indexable = []
    for entry in corpus:
        print(f"Generating embedding for doc {entry['id']}: '{entry['content'][:40]}...'")
        vector = get_embedding(entry["content"])
        if not vector:
            continue
        indexable.append({**entry, "embedding": vector})

    # Second pass: index whatever was embedded successfully.
    for entry in indexable:
        client.collections[TEST_COLLECTION].documents.create(entry)
        print(f"✓ Indexed document {entry['id']}")

    return len(indexable)

def test_pure_keyword_search(client, query_text):
    """Test A: run a keyword-only search on the ``content`` field.

    Args:
        client: Typesense client used to perform the multi-search.
        query_text: The text query.

    Returns:
        A list of ``(document id, text_match)`` tuples in hit order, or an
        empty list when the search raised (the error is printed).
    """
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"TEST A: Pure Keyword Search - '{query_text}'")
    print(divider)

    keyword_query = {
        'collection': TEST_COLLECTION,
        'q': query_text,
        'query_by': 'content',
        'per_page': 10,
    }
    payload = {'searches': [keyword_query]}

    try:
        response = client.multi_search.perform(payload, {})
        first = response['results'][0]
        print(f"Found {first['found']} documents")

        ranked = []
        for position, hit in enumerate(first['hits'], start=1):
            document = hit['document']
            score = hit.get('text_match', 0)
            ranked.append((document['id'], score))
            print(f"  {position}. ID:{document['id']} TextMatch:{score:.4f} '{document['content'][:50]}...'")
        return ranked

    except Exception as err:
        print(f"✗ Keyword search failed: {err}")
        return []

def test_pure_vector_search(client, query_text):
    """Test B: run a vector-only search using an embedding of the query.

    Args:
        client: Typesense client used to perform the multi-search.
        query_text: Text that is embedded and used as the query vector.

    Returns:
        A list of ``(document id, vector_distance)`` tuples in hit order, or
        an empty list when no query embedding could be generated or the
        search raised (the error is printed).
    """
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"TEST B: Pure Vector Search - '{query_text}'")
    print(divider)

    # Without a query vector there is nothing to search with.
    query_vector = get_embedding(query_text)
    if not query_vector:
        return []

    vector_only_query = {
        'collection': TEST_COLLECTION,
        'q': '*',  # Wildcard to ignore text matching
        'vector_query': f'embedding:({json.dumps(query_vector)}, k:10)',
        'exclude_fields': 'embedding',
    }
    payload = {'searches': [vector_only_query]}

    try:
        response = client.multi_search.perform(payload, {})
        first = response['results'][0]
        print(f"Found {first['found']} documents")

        ranked = []
        for position, hit in enumerate(first['hits'], start=1):
            document = hit['document']
            distance = hit.get('vector_distance', 999)
            ranked.append((document['id'], distance))
            print(f"  {position}. ID:{document['id']} VectorDist:{distance:.4f} '{document['content'][:50]}...'")
        return ranked

    except Exception as err:
        print(f"✗ Vector search failed: {err}")
        return []

def test_hybrid_search(client, query_text):
    """Test C: run a hybrid search (text query plus vector query).

    Args:
        client: Typesense client used to perform the multi-search.
        query_text: Text used both as the keyword query and, once embedded,
            as the query vector.

    Returns:
        A list of ``(document id, text_match, vector_distance)`` tuples in
        hit order, or an empty list when no query embedding could be
        generated or the search raised (the error is printed).
    """
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"TEST C: Hybrid Search (RRF) - '{query_text}'")
    print(divider)

    # Without a query vector the hybrid request cannot be built.
    query_vector = get_embedding(query_text)
    if not query_vector:
        return []

    hybrid_query = {
        'collection': TEST_COLLECTION,
        'q': query_text,
        'query_by': 'content',
        'vector_query': f'embedding:({json.dumps(query_vector)}, k:10)',
        'exclude_fields': 'embedding',
        'sort_by': '_text_match:desc',  # Use fusion score
    }
    payload = {'searches': [hybrid_query]}

    try:
        response = client.multi_search.perform(payload, {})
        first = response['results'][0]
        print(f"Found {first['found']} documents")

        ranked = []
        for position, hit in enumerate(first['hits'], start=1):
            document = hit['document']
            score = hit.get('text_match', 0)
            distance = hit.get('vector_distance', 999)
            ranked.append((document['id'], score, distance))
            print(f"  {position}. ID:{document['id']} TextMatch:{score:.4f} VectorDist:{distance:.4f} '{document['content'][:50]}...'")
        return ranked

    except Exception as err:
        print(f"✗ Hybrid search failed: {err}")
        return []

def analyze_rankings(keyword_ranks, vector_ranks, hybrid_ranks, query_text):
    """Compare the document order of the three searches for one query.

    Only the first element of each ranking tuple (the document id) is used.

    Args:
        keyword_ranks: Ranking tuples from the keyword search.
        vector_ranks: Ranking tuples from the vector search.
        hybrid_ranks: Ranking tuples from the hybrid search.
        query_text: The query, used only in the printed header.

    Returns:
        True when the hybrid id order differs from both the keyword order and
        the vector order; False when it equals either one, or when any of the
        three rankings is empty.
    """
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"ANALYSIS: Is RRF Working for '{query_text}'?")
    print(divider)

    if not (keyword_ranks and vector_ranks and hybrid_ranks):
        print("❌ Cannot analyze - missing search results")
        return False

    # Reduce every ranking to its sequence of document ids.
    keyword_order, vector_order, hybrid_order = (
        [entry[0] for entry in ranks]
        for ranks in (keyword_ranks, vector_ranks, hybrid_ranks)
    )

    print(f"Keyword ranking: {keyword_order}")
    print(f"Vector ranking:  {vector_order}")
    print(f"Hybrid ranking:  {hybrid_order}")

    # The keyword comparison takes precedence when hybrid equals both.
    if hybrid_order == keyword_order:
        print("⚠️  Hybrid ranking MATCHES keyword ranking - keyword search dominating?")
        return False
    if hybrid_order == vector_order:
        print("⚠️  Hybrid ranking MATCHES vector ranking - vector search dominating?")
        return False

    print("✅ Hybrid ranking is DIFFERENT from both - RRF fusion is working!")
    return True

def main():
    """Run the RRF fusion test end to end.

    Recreates the test collection, indexes the test documents, runs the
    keyword / vector / hybrid searches for two queries, and prints a verdict.

    Returns:
        0 when ``analyze_rankings`` reports a working fusion for at least one
        of the two queries; 1 otherwise, including when the collection cannot
        be created or no document could be indexed.
    """
    import time

    print("Testing Rank Fusion (RRF) in Hybrid Search")
    print("="*50)

    client = setup_client()
    cleanup_collection(client)

    if not create_test_collection(client):
        return 1

    # From here on the collection exists, so remove it on every exit path.
    # Previously the early "no documents indexed" return left it behind.
    try:
        doc_count = setup_test_documents(client)
        if doc_count == 0:
            return 1

        print(f"\nWaiting for indexing of {doc_count} documents...")
        time.sleep(3)

        # Test with a query that should show differences
        query_text = "Schaden"  # Should find exact match + semantic matches

        # Run all three search types
        keyword_results = test_pure_keyword_search(client, query_text)
        vector_results = test_pure_vector_search(client, query_text)
        hybrid_results = test_hybrid_search(client, query_text)

        # Analyze results
        rrf_working = analyze_rankings(keyword_results, vector_results, hybrid_results, query_text)

        # Test with another query
        query_text2 = "Kompensation"  # Semantic concept, not exact keyword
        print(f"\n{'='*60}")
        print(f"TESTING SECOND QUERY: '{query_text2}'")
        print('='*60)

        keyword_results2 = test_pure_keyword_search(client, query_text2)
        vector_results2 = test_pure_vector_search(client, query_text2)
        hybrid_results2 = test_hybrid_search(client, query_text2)

        rrf_working2 = analyze_rankings(keyword_results2, vector_results2, hybrid_results2, query_text2)

        # Final verdict: one query with a distinct hybrid ranking is enough.
        fusion_ok = rrf_working or rrf_working2

        print(f"\n{'='*60}")
        print("FINAL VERDICT")
        print('='*60)

        if fusion_ok:
            print("✅ RRF (Rank Fusion) IS WORKING!")
            print("   → Hybrid search combines keyword + vector rankings")
            print("   → Rankings differ from pure keyword or pure vector")
            print("   → Our hybrid search implementation is effective")
        else:
            print("❌ RRF might not be working as expected")
            print("   → Hybrid search may be dominated by one signal")
            print("   → Need to investigate alpha parameter or other settings")

        return 0 if fusion_ok else 1
    finally:
        cleanup_collection(client)

# Run the test only when executed as a script; main()'s return value is
# passed to sys.exit() and so becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Kishore Nallan
08/28/2025, 10:11 AM
Ok cool that's good then
Marius Wilsch
08/28/2025, 10:11 AM
But yeah, I'm not entirely sure if my tests are valid. I have only been using this tool since yesterday.
10 Views
Open in Slack
Previous Next