Indexer Internals
Deep dive into the indexing architecture, code parsing, embedding generation, and vector storage.
Architecture
%%{init: {'theme':'neutral'}}%%
sequenceDiagram
participant Git
participant Hook
participant Indexer
participant Parser
participant Embedder
participant Qdrant
Git->>Hook: git commit
Hook->>Indexer: Changed files
Indexer->>Parser: Parse files
Parser->>Parser: Extract components
loop For each component
Parser->>Embedder: Generate embedding
Embedder->>Embedder: Local model
Embedder->>Qdrant: Store vector + metadata
end
Qdrant->>Git: Complete
Core Components
1. File Watcher
Monitors git operations and triggers indexing:
python
# indexer/watcher.py
class FileWatcher:
    """Track a git repository's HEAD and report indexable files that changed.

    The watcher remembers which commit was HEAD the last time it was asked,
    so each call to ``get_changed_files`` only reports the difference between
    that commit and the current HEAD.
    """

    # Suffixes accepted for indexing. Kept as a tuple so it can be passed
    # straight to ``str.endswith``.
    INDEXABLE_EXTENSIONS = ('.php', '.js', '.jsx', '.scss', '.css', '.json')

    def __init__(self, repository_path):
        """Open the repository and remember its current HEAD commit.

        Args:
            repository_path: Path handed to ``git.Repo``.
        """
        self.repo = git.Repo(repository_path)
        self.last_commit = self.repo.head.commit

    def get_changed_files(self):
        """Get files changed since last index.

        Diffs the current HEAD commit against the commit remembered from the
        previous call (or from construction), then advances the remembered
        commit to the current HEAD.

        Returns:
            list[str]: ``a_path`` of every diff entry that passes
            ``_should_index``, in diff order.
        """
        current_commit = self.repo.head.commit
        # NOTE(review): the diff runs from the current commit towards the
        # previous one, so ``a_path`` is presumably the path on the
        # current-commit side -- confirm against the GitPython docs. Files
        # that were deleted since the last index may also show up here;
        # callers that open the returned paths should expect that.
        changed_files = [
            entry.a_path
            for entry in current_commit.diff(self.last_commit)
            if self._should_index(entry.a_path)
        ]
        self.last_commit = current_commit
        return changed_files

    def _should_index(self, file_path):
        """Filter indexable files.

        Args:
            file_path: Path of a changed file; may be ``None`` or empty.

        Returns:
            bool: ``True`` when the path ends with one of
            ``INDEXABLE_EXTENSIONS``; ``False`` otherwise, including for a
            missing path.
        """
        # A diff entry without a path must not crash the whole scan.
        if not file_path:
            return False
        return file_path.endswith(self.INDEXABLE_EXTENSIONS)
# 2. Code Parser
Extracts semantic components from source files:
python
# indexer/parser.py
class CodeParser:
    """Route a source file to the language parser registered for its extension."""

    def __init__(self):
        """Build the extension -> parser lookup table.

        Keys are lower-case extensions including the leading dot.
        """
        self.parsers = {
            '.php': PHPParser(),
            '.js': JavaScriptParser(),
            '.jsx': JavaScriptParser(),
            '.scss': CSSParser(),
            '.css': CSSParser(),
        }

    def parse_file(self, file_path, content):
        """Parse file and extract components.

        Args:
            file_path: Path of the file; only its extension is inspected.
            content: Full text of the file.

        Returns:
            list: Whatever the matching parser's ``extract_components``
            returns, or an empty list when no parser is registered for the
            extension.
        """
        # Lower-case the extension so ``Foo.PHP`` is routed like ``foo.php``
        # instead of being silently skipped.
        ext = os.path.splitext(file_path)[1].lower()
        parser = self.parsers.get(ext)
        if not parser:
            return []
        return parser.extract_components(content)
class PHPParser:
    """Regex-based extractor for functions, classes and hook registrations in PHP.

    The ``_extract_function_body``, ``_extract_class_body`` and
    ``_extract_hook_context`` helpers are called below but are not part of
    this excerpt.
    """

    # Named function with a body. Allows one level of nested parentheses in
    # the parameter list (e.g. ``$a = array()``) and an optional return type
    # (``): ?string {``), both of which the simpler ``\([^)]*\)\s*\{`` missed.
    _FUNCTION_RE = re.compile(
        r'\bfunction\s+&?\s*(\w+)\s*'
        r'\((?:[^()]|\([^()]*\))*\)\s*'
        r'(?::\s*\??[\w\\|]+\s*)?\{'
    )
    # Class header; group 2 is the parent class, if any. An ``implements``
    # clause and namespaced parent names are tolerated.
    _CLASS_RE = re.compile(
        r'\bclass\s+(\w+)'
        r'(?:\s+extends\s+([\w\\]+))?'
        r'(?:\s+implements\s+[\w\\,\s]+?)?'
        r'\s*\{'
    )
    # ``add_action('name'`` / ``add_filter("name"`` registrations.
    _HOOK_RE = re.compile(r'(add_action|add_filter)\s*\(\s*["\']([^"\']+)["\']')

    @staticmethod
    def _line_of(content, offset):
        """Return the 1-based line number of character ``offset`` in ``content``."""
        return content.count('\n', 0, offset) + 1

    def extract_components(self, content):
        """Extract functions, classes, hooks from PHP.

        Args:
            content: Full PHP source text.

        Returns:
            list[dict]: All function components first, then classes, then
            hooks. Every component carries ``type``, ``content`` and
            ``line``; functions and classes add ``name`` (classes also
            ``extends``, which is ``None`` without a parent); hooks add
            ``hook_type`` and ``hook_name``.
        """
        components = []

        # Extract functions
        for match in self._FUNCTION_RE.finditer(content):
            components.append({
                'type': 'function',
                'name': match.group(1),
                'content': self._extract_function_body(content, match.start()),
                'line': self._line_of(content, match.start()),
            })

        # Extract classes
        for match in self._CLASS_RE.finditer(content):
            components.append({
                'type': 'class',
                'name': match.group(1),
                'extends': match.group(2),
                'content': self._extract_class_body(content, match.start()),
                'line': self._line_of(content, match.start()),
            })

        # Extract hooks
        for match in self._HOOK_RE.finditer(content):
            components.append({
                'type': 'hook',
                'hook_type': match.group(1),
                'hook_name': match.group(2),
                'content': self._extract_hook_context(content, match.start()),
                'line': self._line_of(content, match.start()),
            })

        return components
# 3. Embedding Generator
Generates vector embeddings using a local model:
python
# indexer/embedder.py
from sentence_transformers import SentenceTransformer
class Embedder:
    """Turn code text into embedding vectors with a locally loaded model."""

    # Patterns used by ``_clean_text``; compiled once at class creation.
    _BLOCK_COMMENT = re.compile(r'/\*.*?\*/', re.DOTALL)
    # ``//`` directly after ``:`` is left alone so URLs such as
    # ``http://host/path`` are not truncated.
    _LINE_COMMENT = re.compile(r'(?<!:)//[^\n]*')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-transformers model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)

    def generate_embedding(self, text):
        """Generate 384-dimensional vector.

        Args:
            text: Raw component text; it is cleaned before encoding.

        Returns:
            list: ``self.model.encode(...)`` converted with ``tolist()``.
            NOTE(review): the 384 figure comes from the original docs for
            the default model -- other models may differ.
        """
        cleaned = self._clean_text(text)
        embedding = self.model.encode(cleaned)
        return embedding.tolist()

    def batch_generate(self, texts, batch_size=32):
        """Generate embeddings in batches for efficiency.

        Args:
            texts: Iterable of raw component texts.
            batch_size: Batch size forwarded to the model.

        Returns:
            list[list]: One vector per input text, in input order. An empty
            input yields an empty list without invoking the model.
        """
        cleaned = [self._clean_text(t) for t in texts]
        if not cleaned:
            return []
        embeddings = self.model.encode(
            cleaned,
            batch_size=batch_size,
            show_progress_bar=True,
        )
        return [e.tolist() for e in embeddings]

    def _clean_text(self, text):
        """Clean text for embedding.

        Strips ``/* ... */`` and ``// ...`` comments, then collapses runs of
        whitespace into single spaces and trims the ends.

        Comments are removed before whitespace is collapsed. A ``//``
        comment ends at the line break, so once newlines are gone the
        pattern would erase everything after the first ``//``.
        """
        # A block comment may sit between two tokens; replace it with a
        # space so they do not fuse.
        text = self._BLOCK_COMMENT.sub(' ', text)
        text = self._LINE_COMMENT.sub('', text)
        text = self._WHITESPACE.sub(' ', text)
        return text.strip()
# 4. Vector Storage
Stores embeddings in Qdrant with metadata:
python
# indexer/storage.py
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
class VectorStorage:
    """Thin wrapper around a Qdrant client for storing component embeddings."""

    def __init__(self, host='localhost', port=6333):
        """Connect to Qdrant.

        Args:
            host: Qdrant host name.
            port: Qdrant port.
        """
        self.client = QdrantClient(host=host, port=port)

    def ensure_collection(self, collection_name, vector_size=384):
        """Create collection if it doesn't exist.

        Args:
            collection_name: Name of the collection to look up or create.
            vector_size: Dimensionality used when the collection is created.
        """
        try:
            self.client.get_collection(collection_name)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not swallowed.
            # NOTE(review): any lookup failure is still treated as
            # "collection missing", as before -- narrow this to the
            # client's not-found error once the client version is pinned.
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(
                    size=vector_size,
                    distance=Distance.COSINE,
                ),
            )

    def store_vectors(self, collection_name, vectors):
        """Store multiple vectors with metadata.

        Args:
            collection_name: Target collection.
            vectors: Iterable of dicts. ``id``, ``embedding``, ``file_path``,
                ``type`` and ``content`` are required. ``name`` (falling
                back to ``hook_name``), ``language``, ``line_start``,
                ``line_end`` and ``commit_hash`` are optional and stored as
                ``None`` when absent.

        Nothing is sent to Qdrant when ``vectors`` is empty.
        """
        from datetime import datetime  # local import keeps this snippet self-contained

        # One timestamp for the whole batch.
        indexed_at = datetime.now().isoformat()

        points = [
            PointStruct(
                id=vector_data['id'],
                vector=vector_data['embedding'],
                payload={
                    'file_path': vector_data['file_path'],
                    'component_type': vector_data['type'],
                    # Hook components carry ``hook_name`` instead of
                    # ``name``; indexing must not fail on them.
                    'component_name': vector_data.get(
                        'name', vector_data.get('hook_name')
                    ),
                    'content': vector_data['content'],
                    # The indexing pipeline does not supply a language.
                    'language': vector_data.get('language'),
                    'line_start': vector_data.get('line_start'),
                    'line_end': vector_data.get('line_end'),
                    'indexed_at': indexed_at,
                    'commit_hash': vector_data.get('commit_hash'),
                },
            )
            for vector_data in vectors
        ]

        if not points:
            return

        self.client.upsert(
            collection_name=collection_name,
            points=points,
        )
# Indexing Pipeline
Main Indexing Flow
python
# indexer/indexer.py
class Indexer:
    """Parse files, embed their components and store the vectors.

    ``_get_indexable_files``, ``_batch`` and ``_generate_id`` are used below
    but are not part of this excerpt.
    """

    def __init__(self, project_name):
        """Create the parser, embedder and storage used by this indexer.

        Args:
            project_name: Used to derive the collection name
                ``project_<project_name>``.
        """
        self.project_name = project_name
        self.parser = CodeParser()
        self.embedder = Embedder()
        self.storage = VectorStorage()

    def index_project(self, repository_path):
        """Index entire project.

        Args:
            repository_path: Root handed to ``_get_indexable_files``.
        """
        collection_name = f"project_{self.project_name}"

        # Ensure collection exists
        self.storage.ensure_collection(collection_name)

        # Get all indexable files
        files = self._get_indexable_files(repository_path)

        # Process in batches
        for batch in self._batch(files, batch_size=10):
            self._index_batch(batch, collection_name)

    def _index_batch(self, files, collection_name):
        """Index batch of files.

        Reads each file, parses it into components, embeds the component
        texts, and stores every resulting vector in a single call.

        Args:
            files: Paths of the files to index.
            collection_name: Collection the vectors are stored in.
        """
        all_vectors = []

        for file_path in files:
            # Decode explicitly so results do not depend on the machine's
            # locale; undecodable bytes are replaced instead of aborting
            # the whole batch.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()

            components = self.parser.parse_file(file_path, content)
            if not components:
                # Nothing to embed -- skip the model call entirely.
                continue

            texts = [c['content'] for c in components]
            embeddings = self.embedder.batch_generate(texts)

            for component, embedding in zip(components, embeddings):
                all_vectors.append({
                    'id': self._generate_id(file_path, component),
                    'file_path': file_path,
                    'embedding': embedding,
                    **component,
                })

        # Store all vectors (no round-trip when the batch produced none)
        if all_vectors:
            self.storage.store_vectors(collection_name, all_vectors)
# Optimization Strategies
1. Incremental Indexing
python
def index_changes_only(self, changed_files):
    """Only index changed files.

    Removes the stored vectors of every changed file, then re-indexes the
    ones that still exist on disk.

    Args:
        changed_files: Paths of files that changed since the last index.
    """
    import os  # local import: this snippet has no module-level imports of its own

    # Materialize once: the paths are walked twice below.
    changed_files = list(changed_files)
    if not changed_files:
        return

    collection_name = f"project_{self.project_name}"

    # Delete old vectors for changed files
    for file_path in changed_files:
        self.storage.delete_by_filter(
            collection_name,
            {"file_path": file_path},
        )

    # A change may be a deletion. Those files only need their vectors
    # dropped; trying to read them again would fail.
    still_present = [path for path in changed_files if os.path.exists(path)]

    # Index new versions
    if still_present:
        self._index_batch(still_present, collection_name)
# 2. Caching
python
class CachedEmbedder(Embedder):
    """Embedder that keeps one pickle file per distinct input text on disk."""

    def __init__(self, cache_dir='.embedding_cache'):
        """Load the model and make sure the cache directory exists.

        Args:
            cache_dir: Directory holding the ``<md5>.pkl`` cache files.
        """
        super().__init__()
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def generate_embedding(self, text):
        """Generate or retrieve from cache.

        Args:
            text: Raw component text; the cache key is the MD5 hex digest of
                its encoded bytes.

        Returns:
            The cached embedding when a readable cache file exists,
            otherwise a freshly generated one (which is then cached).
        """
        import tempfile  # local import: only needed for the atomic write below

        # MD5 is used purely as a file-name digest, not for security.
        cache_key = hashlib.md5(text.encode()).hexdigest()
        cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

        # SECURITY: unpickling executes code embedded in the file. The cache
        # directory must only ever be writable by trusted users.
        try:
            with open(cache_file, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            pass  # cache miss
        except (pickle.UnpicklingError, EOFError):
            # Truncated or corrupt entry (e.g. an interrupted write):
            # fall through and regenerate it.
            pass

        embedding = super().generate_embedding(text)

        # Write to a temporary file in the same directory, then rename, so
        # a concurrent reader never sees a half-written pickle.
        with tempfile.NamedTemporaryFile(
            'wb', dir=self.cache_dir, suffix='.tmp', delete=False
        ) as tmp:
            pickle.dump(embedding, tmp)
        os.replace(tmp.name, cache_file)

        return embedding
# 3. Parallel Processing
python
from concurrent.futures import ThreadPoolExecutor
def index_parallel(self, files, max_workers=4):
    """Index files in parallel.

    Submits ``self._index_file`` for every entry of ``files`` to a thread
    pool, then waits on each task in submission order. An exception raised
    by a task is re-raised here when its result is collected.

    Args:
        files: Paths to hand to ``self._index_file``.
        max_workers: Size of the thread pool.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for path in files:
            pending.append(pool.submit(self._index_file, path))

        for task in pending:
            task.result()
# Performance Metrics
Indexing Speed
| Component | Time per File | Notes |
|---|---|---|
| File Read | ~5ms | Disk I/O |
| Parsing | ~20ms | Regex + AST |
| Embedding | ~50ms | Model inference |
| Storage | ~10ms | Qdrant insert |
| Total | ~85ms | Per file average |
Memory Usage
| Operation | Memory | Notes |
|---|---|---|
| Model Loading | 200MB | One-time |
| Batch Processing | 300MB | Per batch |
| Vector Storage | 50MB | Qdrant client |
| Peak Usage | 550MB | Maximum |
Configuration
Environment Variables
bash
# Embedding model
EMBEDDING_MODEL=all-MiniLM-L6-v2
# Batch size
INDEXING_BATCH_SIZE=32
# Parallel workers
INDEXING_WORKERS=4
# Cache directory
EMBEDDING_CACHE_DIR=.cache/embeddings
# Vector database
VECTOR_DB_HOST=localhost
VECTOR_DB_PORT=6333
Monitoring
Logging
python
import logging
logger = logging.getLogger('indexer')
logger.setLevel(logging.INFO)

# Log indexing progress. Values are passed as arguments rather than
# pre-formatted, so the message is only built when INFO is enabled.
logger.info("Indexing %d files for project %s", len(files), project_name)
logger.info("Generated %d embeddings", len(embeddings))
logger.info("Stored %d vectors", len(vectors))
# Metrics Collection
python
class IndexerMetrics:
    """Running counters for one indexing run, timed from construction."""

    def __init__(self):
        """Zero all counters and record the start time."""
        self.files_indexed = 0
        self.vectors_created = 0
        self.errors = 0
        self.start_time = time.time()

    def record_file(self, file_path, vectors_count):
        """Count one indexed file and the vectors it produced.

        Args:
            file_path: Path of the indexed file (accepted for the caller's
                convenience; not stored).
            vectors_count: Number of vectors created for the file.
        """
        self.files_indexed += 1
        self.vectors_created += vectors_count

    def record_error(self, file_path=None):
        """Count one failed file.

        Args:
            file_path: Path of the file that failed (not stored).
        """
        self.errors += 1

    def get_stats(self):
        """Return a snapshot of the counters plus timing.

        Returns:
            dict: ``files_indexed``, ``vectors_created``, ``errors``,
            ``duration`` (seconds since construction) and
            ``files_per_second``, which is ``0.0`` when no time has
            elapsed.
        """
        duration = time.time() - self.start_time
        # A coarse clock or an immediate call can give a zero duration; a
        # clock adjustment can make it negative. Report 0.0 instead of
        # dividing.
        files_per_second = self.files_indexed / duration if duration > 0 else 0.0
        return {
            'files_indexed': self.files_indexed,
            'vectors_created': self.vectors_created,
            'errors': self.errors,
            'duration': duration,
            'files_per_second': files_per_second,
        }
# See Also: