feat: add chunking methods and ingestion process for Elasticsearch

2026-02-18 14:51:52 +01:00 · 2026-02-18 14:51:52 +01:00 · 26603a9f45
parent f2482cae19
commit 26603a9f45
2 changed files with 396 additions and 0 deletions
--- a/scratches/pseco/ingestion/n00
+++ b/scratches/pseco/ingestion/n00
--- a/scratches/pseco/ingestion/n00
+++ b/scratches/pseco/ingestion/n00
@ -0,0 +1,396 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0a8abbfa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import uuid\n",
    "from dataclasses import dataclass\n",
    "from typing import Iterable, List, Dict, Any, Callable, Protocol\n",
    "\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "from loguru import logger\n",
    "from transformers import AutoTokenizer, AutoModel\n",
    "from elasticsearch import Elasticsearch\n",
    "from elasticsearch.helpers import bulk\n",
    "import nltk\n",
    "from nltk.tokenize import sent_tokenize\n",
    "nltk.download(\"punkt\", quiet=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "77f6c552",
   "metadata": {},
   "source": [
    "### Domain model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c4cd2bc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclass(frozen=True)\n",
    "class Chunk:\n",
    "    doc_id: str\n",
    "    chunk_id: int\n",
    "    text: str\n",
    "    source: str\n",
    "    metadata: Dict[str, Any]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5cd700bd",
   "metadata": {},
   "source": [
    "### Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "84e834d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text(text: str) -> str:\n",
    "    text = text.replace(\"\\u00a0\", \" \")\n",
    "    text = re.sub(r\"\\s+\", \" \", text).strip()\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4ebdc5f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "class ChunkingStrategy(Protocol):\n",
    "    def __call__(self, text: str, **kwargs) -> List[str]:\n",
    "        ..."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82209fc0",
   "metadata": {},
   "source": [
    "### Chunking strategies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "9f360449",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fixed_size_token_chunking(\n",
    "    text: str,\n",
    "    embedding_model_name: str = os.getenv(\"EMBEDDING_MODEL_NAME\"),\n",
    "    chunk_size: int = 1200,\n",
    "    overlap: int = 200,\n",
    ") -> List[str]:\n",
    "\n",
    "    if chunk_size <= overlap:\n",
    "        raise ValueError(\"chunk_size must be greater than overlap\")\n",
    "\n",
    "    tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, use_fast=True)\n",
    "    token_ids = tokenizer.encode(text, add_special_tokens=False)\n",
    "\n",
    "    chunks: List[str] = []\n",
    "    start = 0\n",
    "    n = len(token_ids)\n",
    "\n",
    "    while start < n:\n",
    "        end = min(start + chunk_size, n)\n",
    "        chunk_ids = token_ids[start:end]\n",
    "        chunks.append(tokenizer.decode(chunk_ids, skip_special_tokens=True))\n",
    "\n",
    "        if end == n:\n",
    "            break\n",
    "\n",
    "        start = end - overlap\n",
    "\n",
    "    return chunks\n",
    "\n",
    "\n",
    "def semantic_chunking(\n",
    "    text: str,\n",
    "    embedding_model_name: str = os.getenv(\"EMBEDDING_MODEL_NAME\"),\n",
    "    similarity_threshold: float = 0.78,\n",
    "    max_sentences_per_chunk: int = 12,\n",
    ") -> List[str]:\n",
    "    sentences = [s.strip() for s in sent_tokenize(text) if s.strip()]\n",
    "    if not sentences:\n",
    "        return []\n",
    "    print(f\"Semantic chunking: {len(sentences)} sentences found\")\n",
    "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "    tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)\n",
    "    model = AutoModel.from_pretrained(embedding_model_name).to(device)\n",
    "    model.eval()\n",
    "\n",
    "    with torch.no_grad():\n",
    "        enc = tokenizer(sentences, padding=True, truncation=True, return_tensors=\"pt\").to(device)\n",
    "        out = model(**enc)\n",
    "        mask = enc[\"attention_mask\"].unsqueeze(-1)\n",
    "        vecs = (out.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)\n",
    "        vecs = F.normalize(vecs, p=2, dim=1)\n",
    "\n",
    "    chunks: List[List[str]] = [[sentences[0]]]\n",
    "\n",
    "    for i in range(1, len(sentences)):\n",
    "        sim = float((vecs[i - 1] * vecs[i]).sum())\n",
    "        if sim < similarity_threshold or len(chunks[-1]) >= max_sentences_per_chunk:\n",
    "            chunks.append([])\n",
    "        chunks[-1].append(sentences[i])\n",
    "\n",
    "    return [\" \".join(chunk) for chunk in chunks if chunk]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bc7267d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "CHUNKING_REGISTRY: Dict[str, ChunkingStrategy] = {\n",
    "    \"fixed\": fixed_size_token_chunking,\n",
    "    \"semantic\": semantic_chunking,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "87f2f70c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_chunks(\n",
    "    doc_text: str,\n",
    "    source: str,\n",
    "    metadata: Dict[str, Any],\n",
    "    chunking_strategy: str = \"fixed\",\n",
    "    **chunking_kwargs,\n",
    ") -> List[Chunk]:\n",
    "\n",
    "    if chunking_strategy not in CHUNKING_REGISTRY:\n",
    "        raise ValueError(\n",
    "            f\"Unknown chunking strategy '{chunking_strategy}'. \"\n",
    "            f\"Available: {list(CHUNKING_REGISTRY.keys())}\"\n",
    "        )\n",
    "\n",
    "    doc_id = metadata.get(\"doc_id\") or str(uuid.uuid4())\n",
    "    cleaned = clean_text(doc_text)\n",
    "\n",
    "    chunking_fn = CHUNKING_REGISTRY[chunking_strategy]\n",
    "\n",
    "    parts = chunking_fn(cleaned, **chunking_kwargs)\n",
    "\n",
    "    return [\n",
    "        Chunk(\n",
    "            doc_id=doc_id,\n",
    "            chunk_id=i,\n",
    "            text=part,\n",
    "            source=source,\n",
    "            metadata={**metadata, \"doc_id\": doc_id},\n",
    "        )\n",
    "        for i, part in enumerate(parts)\n",
    "        if part.strip()\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba5649e9",
   "metadata": {},
   "source": [
    "### Ingestion in elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff03c689",
   "metadata": {},
   "outputs": [],
   "source": [
    "def index_chunks(\n",
    "    es: Elasticsearch,\n",
    "    index_name: str,\n",
    "    model: SentenceTransformer,\n",
    "    chunks: List[Chunk],\n",
    "    batch_size: int = 64,\n",
    ") -> None:\n",
    "    def actions() -> Iterable[Dict[str, Any]]:\n",
    "        # Embed in batches for speed\n",
    "        for i in range(0, len(chunks), batch_size):\n",
    "            batch = chunks[i:i + batch_size]\n",
    "            texts = [c.text for c in batch]\n",
    "            vectors = model.encode(texts, normalize_embeddings=True).tolist()\n",
    "\n",
    "            for c, v in zip(batch, vectors):\n",
    "                yield {\n",
    "                    \"_op_type\": \"index\",\n",
    "                    \"_index\": index_name,\n",
    "                    \"_id\": f\"{c.doc_id}:{c.chunk_id}\",\n",
    "                    \"_source\": {\n",
    "                        \"doc_id\": c.doc_id,\n",
    "                        \"chunk_id\": c.chunk_id,\n",
    "                        \"text\": c.text,\n",
    "                        \"source\": c.source,\n",
    "                        \"metadata\": c.metadata,\n",
    "                        \"embedding\": v,\n",
    "                    },\n",
    "                }\n",
    "\n",
    "    bulk(es, actions(), request_timeout=120)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bcf0c87",
   "metadata": {},
   "outputs": [],
   "source": [
    "es_url = os.environ.get(\"ES_URL\", \"http://localhost:9200\")\n",
    "es_user = os.environ.get(\"ES_USER\")\n",
    "es_pass = os.environ.get(\"ES_PASS\")\n",
    "index_name = \"my_docs_v1\"\n",
    "\n",
    "es = Elasticsearch(\n",
    "    es_url,\n",
    "    basic_auth=(es_user, es_pass) if es_user and es_pass else None,\n",
    "    request_timeout=60,\n",
    ")\n",
    "\n",
    "# Pick a model with dims matching your index mapping.\n",
    "model = SentenceTransformer(\"sentence-transformers/all-mpnet-base-v2\")  # 768 dims\n",
    "\n",
    "# Example document\n",
    "doc_text = \"\"\"\n",
    "This is a sample document. Replace this with your PDF/HTML extraction output.\n",
    "\"\"\"\n",
    "chunks = build_chunks(\n",
    "    doc_text=doc_text,\n",
    "    source=\"local_demo\",\n",
    "    metadata={\"title\": \"Demo\", \"doc_id\": \"demo-001\"},\n",
    ")\n",
    "\n",
    "index_chunks(es, index_name, model, chunks)\n",
    "es.indices.refresh(index=index_name)\n",
    "print(f\"Indexed {len(chunks)} chunks into {index_name}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b1ba8e85",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b763a493689549a180ab815567520c0a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Example document\n",
    "doc_text = \"\"\"\n",
    "This is a sample document. Replace this with your PDF/HTML extraction output.\n",
    "\"\"\"\n",
    "chunks = build_chunks(\n",
    "    doc_text=doc_text,\n",
    "    source=\"local_demo\",\n",
    "    metadata={\"title\": \"Demo\", \"doc_id\": \"demo-001\"},\n",
    "    chunking_strategy=\"semantic\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "b2c52b38",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Chunk(doc_id='demo-001', chunk_id=0, text='This is a sample document. Replace this with your PDF/HTML extraction output.', source='local_demo', metadata={'title': 'Demo', 'doc_id': 'demo-001'})]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daa57061",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "assistance-engine",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }