Skip to content

Indexer Internals

Deep dive into the indexing architecture, code parsing, embedding generation, and vector storage.

Architecture

%%{init: {'theme':'neutral'}}%%
sequenceDiagram
    participant Git
    participant Hook
    participant Indexer
    participant Parser
    participant Embedder
    participant Qdrant

    Git->>Hook: git commit
    Hook->>Indexer: Changed files
    Indexer->>Parser: Parse files
    Parser->>Parser: Extract components
    loop For each component
        Parser->>Embedder: Generate embedding
        Embedder->>Embedder: Local model
        Embedder->>Qdrant: Store vector + metadata
    end
    Qdrant->>Git: Complete

Core Components

1. File Watcher

Tracks the repository's HEAD commit and reports which indexable files changed since the last index run:

python
# indexer/watcher.py
class FileWatcher:
    """Remembers a repository's HEAD commit and lists indexable files that changed."""

    def __init__(self, repository_path):
        """Open ``git.Repo(repository_path)`` and remember its current HEAD commit."""
        self.repo = git.Repo(repository_path)
        self.last_commit = self.repo.head.commit

    def get_changed_files(self):
        """Return the indexable paths that differ from the remembered commit.

        The current HEAD commit is diffed against ``self.last_commit``; the
        ``a_path`` of every diff entry that passes ``_should_index`` is
        collected. Afterwards the remembered commit is advanced to HEAD, so
        the next call only reports newer changes.
        """
        head = self.repo.head.commit
        paths = [
            entry.a_path
            for entry in head.diff(self.last_commit)
            if self._should_index(entry.a_path)
        ]

        # Only advance the marker once the diff has been fully consumed.
        self.last_commit = head
        return paths

    def _should_index(self, file_path):
        """Return True when ``file_path`` ends with one of the indexable suffixes."""
        indexable_suffixes = ('.php', '.js', '.jsx', '.scss', '.css', '.json')
        return file_path.endswith(indexable_suffixes)

2. Code Parser

Extracts semantic components from source files:

python
# indexer/parser.py
class CodeParser:
    """Dispatches a source file to the parser registered for its extension."""

    def __init__(self):
        """Register one parser instance per supported file extension."""
        registry = {}
        registry['.php'] = PHPParser()
        registry['.js'] = JavaScriptParser()
        registry['.jsx'] = JavaScriptParser()
        registry['.scss'] = CSSParser()
        registry['.css'] = CSSParser()
        self.parsers = registry

    def parse_file(self, file_path, content):
        """Return the components extracted from ``content``.

        The parser is chosen by the extension of ``file_path``; a file whose
        extension has no registered parser yields an empty list.
        """
        _, suffix = os.path.splitext(file_path)
        handler = self.parsers.get(suffix)
        return handler.extract_components(content) if handler else []

class PHPParser:
    """Regex-based extractor for functions, classes and hook registrations in PHP.

    The ``_extract_function_body``, ``_extract_class_body`` and
    ``_extract_hook_context`` helpers used below are not part of this
    snippet; each is called with the full source and the match offset.
    """

    # ``function name(args)`` followed by an optional return-type declaration
    # (``: int``, ``: ?array``, ``: A|B``, ``: \Ns\Type``) and the opening brace.
    _FUNCTION_PATTERN = r'function\s+(\w+)\s*\([^)]*\)\s*(?::\s*\??[\w\\|&]+\s*)?\{'

    # ``class Name`` with optional ``extends Parent`` (group 2, may be
    # namespaced) and an optional ``implements A, B`` list before the brace.
    _CLASS_PATTERN = (
        r'class\s+(\w+)'
        r'(?:\s+extends\s+([\w\\]+))?'
        r'(?:\s+implements\s+[\w\\]+(?:\s*,\s*[\w\\]+)*)?'
        r'\s*\{'
    )

    # ``add_action('hook'`` / ``add_filter("hook"`` -- group 2 is the hook name.
    _HOOK_PATTERN = r'(add_action|add_filter)\s*\(\s*["\']([^"\']+)["\']'

    @staticmethod
    def _line_number(content, offset):
        """Return the 1-based line number of character ``offset`` in ``content``."""
        return content.count('\n', 0, offset) + 1

    def extract_components(self, content):
        """Extract functions, classes and hooks from PHP source.

        Args:
            content: Full text of a PHP file.

        Returns:
            A list of dicts, functions first, then classes, then hooks. Every
            dict has ``type``, ``content`` and ``line`` (1-based line of the
            match). Functions add ``name``; classes add ``name`` and
            ``extends`` (``None`` without a parent); hooks add ``hook_type``
            and ``hook_name``.
        """
        components = []

        # Functions (methods inside classes match as well).
        for match in re.finditer(self._FUNCTION_PATTERN, content):
            components.append({
                'type': 'function',
                'name': match.group(1),
                'content': self._extract_function_body(content, match.start()),
                'line': self._line_number(content, match.start()),
            })

        # Classes, including those declaring ``implements``.
        for match in re.finditer(self._CLASS_PATTERN, content):
            components.append({
                'type': 'class',
                'name': match.group(1),
                'extends': match.group(2),
                'content': self._extract_class_body(content, match.start()),
                'line': self._line_number(content, match.start()),
            })

        # Hook registrations.
        for match in re.finditer(self._HOOK_PATTERN, content):
            components.append({
                'type': 'hook',
                'hook_type': match.group(1),
                'hook_name': match.group(2),
                'content': self._extract_hook_context(content, match.start()),
                'line': self._line_number(content, match.start()),
            })

        return components

3. Embedding Generator

Generates vector embeddings using a local model:

python
# indexer/embedder.py
from sentence_transformers import SentenceTransformer

class Embedder:
    """Turns text into embedding vectors with a ``SentenceTransformer`` model."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-transformers model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def generate_embedding(self, text):
        """Generate the embedding for one text (384-dimensional vector).

        The text is normalised with ``_clean_text`` first; the encoded result
        is returned as a plain list via ``tolist()``.
        """
        text = self._clean_text(text)
        embedding = self.model.encode(text)
        return embedding.tolist()

    def batch_generate(self, texts, batch_size=32):
        """Generate embeddings in batches for efficiency.

        Args:
            texts: Iterable of raw texts; each is cleaned before encoding.
            batch_size: Batch size forwarded to ``model.encode``.

        Returns:
            One list per input text, in input order. An empty input returns
            ``[]`` without calling the model.
        """
        texts = [self._clean_text(t) for t in texts]
        if not texts:
            return []

        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=True
        )
        return [e.tolist() for e in embeddings]

    def _clean_text(self, text):
        """Strip ``/* */`` and ``//`` comments, then collapse whitespace.

        Comments must be removed while line breaks are still present: a
        ``//`` comment ends at the end of its line, so collapsing newlines
        first would make it swallow the rest of the text.
        """
        # Block comments may span several lines, hence DOTALL.
        text = re.sub(r'/\*.*?\*/', '', text, flags=re.DOTALL)
        # Line comments run to the end of the current line only.
        text = re.sub(r'//.*?$', '', text, flags=re.MULTILINE)
        # Collapse every whitespace run (including newlines) to one space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

4. Vector Storage

Stores embeddings in Qdrant with metadata:

python
# indexer/storage.py
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

class VectorStorage:
    """Thin wrapper around ``QdrantClient`` for storing embeddings with metadata."""

    def __init__(self, host='localhost', port=6333):
        """Create a Qdrant client for ``host``:``port``."""
        self.client = QdrantClient(host=host, port=port)

    def ensure_collection(self, collection_name, vector_size=384):
        """Create the collection if looking it up fails.

        Args:
            collection_name: Name of the Qdrant collection.
            vector_size: Dimensionality of the stored vectors.
        """
        try:
            self.client.get_collection(collection_name)
        except Exception:
            # Any lookup failure is treated as "collection missing".
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate instead of triggering a create call.
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(
                    size=vector_size,
                    distance=Distance.COSINE
                )
            )

    def store_vectors(self, collection_name, vectors):
        """Store multiple vectors with metadata.

        Args:
            collection_name: Target collection.
            vectors: Iterable of dicts. ``id``, ``embedding``, ``file_path``,
                ``type`` and ``content`` are required; ``name``, ``language``,
                ``line_start``, ``line_end`` and ``commit_hash`` are optional
                and stored as ``None`` when absent.

        Nothing is sent to Qdrant when ``vectors`` is empty.
        """
        points = []

        for vector_data in vectors:
            point = PointStruct(
                id=vector_data['id'],
                vector=vector_data['embedding'],
                payload={
                    'file_path': vector_data['file_path'],
                    'component_type': vector_data['type'],
                    # Not every producer supplies these keys (hook components,
                    # for example, carry ``hook_name`` rather than ``name``).
                    'component_name': vector_data.get('name'),
                    'content': vector_data['content'],
                    'language': vector_data.get('language'),
                    'line_start': vector_data.get('line_start'),
                    'line_end': vector_data.get('line_end'),
                    'indexed_at': datetime.now().isoformat(),
                    'commit_hash': vector_data.get('commit_hash')
                }
            )
            points.append(point)

        if not points:
            # Avoid issuing an upsert request with no points.
            return

        self.client.upsert(
            collection_name=collection_name,
            points=points
        )

Indexing Pipeline

Main Indexing Flow

python
# indexer/indexer.py
class Indexer:
    """Parses files, embeds their components and stores the vectors.

    ``_get_indexable_files``, ``_batch`` and ``_generate_id`` are used below
    but are not part of this snippet.
    """

    def __init__(self, project_name):
        """Create the parser, embedder and storage used for ``project_name``."""
        self.project_name = project_name
        self.parser = CodeParser()
        self.embedder = Embedder()
        self.storage = VectorStorage()

    def index_project(self, repository_path):
        """Index entire project.

        Ensures the ``project_<name>`` collection exists, then indexes the
        files returned by ``_get_indexable_files`` in batches of 10.
        """
        collection_name = f"project_{self.project_name}"

        # Ensure collection exists
        self.storage.ensure_collection(collection_name)

        # Get all indexable files
        files = self._get_indexable_files(repository_path)

        # Process in batches
        for batch in self._batch(files, batch_size=10):
            self._index_batch(batch, collection_name)

    def _index_batch(self, files, collection_name):
        """Index a batch of files and store their vectors in one call.

        Args:
            files: Paths of the files to read and parse.
            collection_name: Collection the vectors are stored in.

        Files that yield no components are skipped, and storage is not
        called at all when the whole batch produced no vectors.
        """
        all_vectors = []

        for file_path in files:
            # Decode explicitly as UTF-8 so results do not depend on the
            # platform's locale encoding; undecodable bytes are replaced
            # rather than aborting the whole batch.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()

            components = self.parser.parse_file(file_path, content)
            if not components:
                # Nothing to embed for this file.
                continue

            texts = [c['content'] for c in components]
            embeddings = self.embedder.batch_generate(texts)

            # Component keys are spread last, matching the original layout.
            for component, embedding in zip(components, embeddings):
                all_vectors.append({
                    'id': self._generate_id(file_path, component),
                    'file_path': file_path,
                    'embedding': embedding,
                    **component
                })

        if all_vectors:
            self.storage.store_vectors(collection_name, all_vectors)

Optimization Strategies

1. Incremental Indexing

python
def index_changes_only(self, changed_files):
    """Re-index only the files that changed.

    Stored vectors are deleted for every path in ``changed_files``. Only the
    paths that still exist on disk are then re-indexed, so a file removed by
    the change has its vectors dropped instead of failing when it is opened.

    Args:
        changed_files: Iterable of file paths touched by the change.
    """
    import os  # local import: this snippet has no import block of its own

    # Materialise once; the paths are walked twice below.
    changed_files = list(changed_files)
    collection_name = f"project_{self.project_name}"

    # Delete old vectors for changed files
    for file_path in changed_files:
        self.storage.delete_by_filter(
            collection_name,
            {"file_path": file_path}
        )

    # Index new versions of the files that are still present
    existing_files = [path for path in changed_files if os.path.isfile(path)]
    if existing_files:
        self._index_batch(existing_files, collection_name)

2. Caching

python
class CachedEmbedder(Embedder):
    """Embedder that keeps one pickle file per distinct text in ``cache_dir``."""

    def __init__(self, cache_dir='.embedding_cache'):
        """Initialise the base embedder and create ``cache_dir`` if needed."""
        super().__init__()
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def generate_embedding(self, text):
        """Generate the embedding for ``text`` or retrieve it from the cache.

        The cache key is the MD5 hex digest of the UTF-8 encoded text. A
        missing, truncated or unreadable cache entry is treated as a miss and
        rewritten.
        """
        import tempfile  # local import: used only for the atomic write below

        cache_key = hashlib.md5(text.encode()).hexdigest()
        cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

        # NOTE(review): pickle.load can execute code from a crafted file; this
        # is only acceptable while the cache directory is trusted.
        try:
            with open(cache_file, 'rb') as f:
                return pickle.load(f)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            # Cache miss or a corrupt entry: fall through and regenerate it.
            pass

        embedding = super().generate_embedding(text)

        # Write to a temporary file in the same directory and rename it into
        # place, so a reader never sees a half-written entry.
        fd, tmp_path = tempfile.mkstemp(dir=self.cache_dir, suffix='.tmp')
        try:
            with os.fdopen(fd, 'wb') as f:
                pickle.dump(embedding, f)
            os.replace(tmp_path, cache_file)
        except Exception:
            # Do not leave stray temporary files behind on failure.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        return embedding

3. Parallel Processing

python
from concurrent.futures import ThreadPoolExecutor

def index_parallel(self, files, max_workers=4):
    """Run ``self._index_file`` on every file using a pool of worker threads.

    Every file is submitted up front; the results are then collected in
    submission order, so an exception raised by a worker is re-raised here.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for path in files:
            pending.append(pool.submit(self._index_file, path))

        # result() blocks until the task finishes and surfaces its exception.
        for task in pending:
            task.result()

Performance Metrics

Indexing Speed

| Component | Time per File | Notes |
|-----------|---------------|-------|
| File Read | ~5ms | Disk I/O |
| Parsing | ~20ms | Regex + AST |
| Embedding | ~50ms | Model inference |
| Storage | ~10ms | Qdrant insert |
| Total | ~85ms | Per file average |

Memory Usage

| Operation | Memory | Notes |
|-----------|--------|-------|
| Model Loading | 200MB | One-time |
| Batch Processing | 300MB | Per batch |
| Vector Storage | 50MB | Qdrant client |
| Peak Usage | 550MB | Maximum |

Configuration

Environment Variables

bash
# Embedding model
EMBEDDING_MODEL=all-MiniLM-L6-v2

# Batch size
INDEXING_BATCH_SIZE=32

# Parallel workers
INDEXING_WORKERS=4

# Cache directory
EMBEDDING_CACHE_DIR=.cache/embeddings

# Vector database
VECTOR_DB_HOST=localhost
VECTOR_DB_PORT=6333

Monitoring

Logging

python
import logging

# Logger named 'indexer', with its level set to INFO.
logger = logging.getLogger('indexer')
logger.setLevel(logging.INFO)

# Log indexing progress. Values are passed as arguments rather than being
# pre-formatted, so the message is only built if the record is emitted.
logger.info("Indexing %d files for project %s", len(files), project_name)
logger.info("Generated %d embeddings", len(embeddings))
logger.info("Stored %d vectors", len(vectors))

Metrics Collection

python
class IndexerMetrics:
    """Running counters for an indexing session, timed from construction."""

    def __init__(self):
        """Zero all counters and record the start time."""
        self.files_indexed = 0
        self.vectors_created = 0
        self.errors = 0
        self.start_time = time.time()

    def record_file(self, file_path, vectors_count):
        """Count one indexed file and the vectors created for it.

        ``file_path`` is accepted for the caller's convenience and is not
        stored.
        """
        self.files_indexed += 1
        self.vectors_created += vectors_count

    def record_error(self, count=1):
        """Add ``count`` (default 1) to the ``errors`` counter."""
        self.errors += count

    def get_stats(self):
        """Return a snapshot of the counters plus timing figures.

        Returns:
            Dict with ``files_indexed``, ``vectors_created``, ``errors``,
            ``duration`` (seconds since construction, never negative) and
            ``files_per_second`` (``0.0`` when no time has elapsed).
        """
        # time.time() can repeat or step backwards; clamp so the rate below
        # can neither divide by zero nor come out negative.
        duration = max(time.time() - self.start_time, 0.0)
        files_per_second = self.files_indexed / duration if duration > 0 else 0.0
        return {
            'files_indexed': self.files_indexed,
            'vectors_created': self.vectors_created,
            'errors': self.errors,
            'duration': duration,
            'files_per_second': files_per_second
        }

See Also:

Released under the MIT License.