diff --git a/scratches/pseco/ingestion/n00 ingestion notebook.ipynb b/scratches/pseco/ingestion/n00 ingestion notebook.ipynb
new file mode 100644
index 0000000..2025cf0
--- /dev/null
+++ b/scratches/pseco/ingestion/n00 ingestion notebook.ipynb
@@ -0,0 +1,248 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f1400112",
+   "metadata": {},
+   "source": [
+    "# Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d593da4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataclasses import dataclass\n",
+    "from typing import List, Protocol, Callable, Dict, Any\n",
+    "\n",
+    "import numpy as np\n",
+    "import tiktoken\n",
+    "from sentence_transformers import SentenceTransformer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "157bead3",
+   "metadata": {},
+   "source": [
+    "# Chunking Methods"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4555f236",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@dataclass\n",
+    "class Chunk:\n",
+    "    text: str\n",
+    "    meta: Dict[str, Any]\n",
+    "\n",
+    "\n",
+    "class Chunker(Protocol):\n",
+    "    name: str\n",
+    "\n",
+    "    def chunk(self, text: str, meta: Dict[str, Any]) -> List[Chunk]: ...\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "177e3b00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class TokenChunker:\n",
+    "    def __init__(\n",
+    "        self, name: str, encoding_name=\"cl100k_base\", chunk_size=400, overlap=60\n",
+    "    ):\n",
+    "        assert overlap < chunk_size, \"overlap must be smaller than chunk_size\"\n",
+    "        self.name = name\n",
+    "        self.enc = tiktoken.get_encoding(encoding_name)\n",
+    "        self.chunk_size = chunk_size\n",
+    "        self.overlap = overlap\n",
+    "\n",
+    "    def chunk(self, text: str, meta: Dict[str, Any]) -> List[Chunk]:\n",
+    "        tokens = self.enc.encode(text)\n",
+    "        chunks = []\n",
+    "        start = 0\n",
+    "        i = 0\n",
+    "        while start < len(tokens):\n",
+    "            end = min(start + self.chunk_size, len(tokens))\n",
+    "            piece_text = self.enc.decode(tokens[start:end])\n",
+    "            chunks.append(\n",
+    "                Chunk(piece_text, {**meta, \"chunk_index\": i, \"method\": self.name})\n",
+    "            )\n",
+    "            i += 1\n",
+    "            if end == len(tokens):\n",
+    "                # The final chunk reached the end of the text; stepping back\n",
+    "                # by `overlap` here would re-emit the tail forever.\n",
+    "                break\n",
+    "            start = end - self.overlap\n",
+    "        return chunks\n"
+   ]
+  },
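+  {
+   "cell_type": "markdown",
+   "id": "b3a1c9d2",
+   "metadata": {},
+   "source": [
+    "A minimal sanity check for `TokenChunker` on inline filler text (a sketch; the sample string and the sizes are arbitrary and not part of the pipeline)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4e2f0a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical sample: repeated filler, long enough to force several chunks.\n",
+    "sample = \"RAG pipelines split documents into chunks before embedding them. \" * 80\n",
+    "demo = TokenChunker(name=\"demo_token\", chunk_size=100, overlap=20)\n",
+    "demo_chunks = demo.chunk(sample, {\"source\": \"inline-sample\"})\n",
+    "print(f\"{len(demo_chunks)} chunks\")\n",
+    "# Every chunk should be at most chunk_size tokens, with overlap-sized repeats.\n",
+    "print([len(demo.enc.encode(c.text)) for c in demo_chunks])\n"
+   ]
+  },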
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9bda4930",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cosine(a, b):\n",
+    "    a = a / (np.linalg.norm(a) + 1e-12)\n",
+    "    b = b / (np.linalg.norm(b) + 1e-12)\n",
+    "    return float(np.dot(a, b))\n",
+    "\n",
+    "\n",
+    "class SemanticChunker:\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        name: str,\n",
+    "        embedder: SentenceTransformer,\n",
+    "        max_tokens_fn: Callable[[str], int],\n",
+    "        max_tokens=400,\n",
+    "        similarity_threshold=0.78,\n",
+    "        min_unit_chars=80,\n",
+    "    ):\n",
+    "        self.name = name\n",
+    "        self.embedder = embedder\n",
+    "        self.max_tokens_fn = max_tokens_fn\n",
+    "        self.max_tokens = max_tokens\n",
+    "        self.similarity_threshold = similarity_threshold\n",
+    "        self.min_unit_chars = min_unit_chars\n",
+    "\n",
+    "    def _units(self, text: str) -> List[str]:\n",
+    "        # Paragraphs as the base unit (usually a good starting point)\n",
+    "        parts = [p.strip() for p in text.split(\"\\n\\n\") if p.strip()]\n",
+    "        # Optional: merge tiny paragraphs into their neighbor\n",
+    "        merged = []\n",
+    "        buf = \"\"\n",
+    "        for p in parts:\n",
+    "            if len(buf) < self.min_unit_chars:\n",
+    "                buf = (buf + \"\\n\\n\" + p).strip() if buf else p\n",
+    "            else:\n",
+    "                merged.append(buf)\n",
+    "                buf = p\n",
+    "        if buf:\n",
+    "            merged.append(buf)\n",
+    "        return merged\n",
+    "\n",
+    "    def chunk(self, text: str, meta: Dict[str, Any]) -> List[Chunk]:\n",
+    "        units = self._units(text)\n",
+    "        if not units:\n",
+    "            return []\n",
+    "\n",
+    "        unit_embs = self.embedder.encode(units, normalize_embeddings=True)\n",
+    "\n",
+    "        chunks: List[Chunk] = []\n",
+    "        cur_texts = [units[0]]\n",
+    "        cur_center = unit_embs[0]\n",
+    "        cur_tokens = self.max_tokens_fn(units[0])\n",
+    "        chunk_index = 0\n",
+    "\n",
+    "        for i in range(1, len(units)):\n",
+    "            u = units[i]\n",
+    "            e = unit_embs[i]\n",
+    "            u_tokens = self.max_tokens_fn(u)\n",
+    "\n",
+    "            sim = float(np.dot(cur_center, e))  # embeddings are already normalized\n",
+    "            would_tokens = cur_tokens + u_tokens\n",
+    "\n",
+    "            if sim >= self.similarity_threshold and would_tokens <= self.max_tokens:\n",
+    "                cur_texts.append(u)\n",
+    "                # Update the centroid as a normalized running mean\n",
+    "                cur_center = cur_center + e\n",
+    "                cur_center = cur_center / (np.linalg.norm(cur_center) + 1e-12)\n",
+    "                cur_tokens = would_tokens\n",
+    "            else:\n",
+    "                chunks.append(\n",
+    "                    Chunk(\n",
+    "                        \"\\n\\n\".join(cur_texts),\n",
+    "                        {**meta, \"chunk_index\": chunk_index, \"method\": self.name},\n",
+    "                    )\n",
+    "                )\n",
+    "                chunk_index += 1\n",
+    "                cur_texts = [u]\n",
+    "                cur_center = e\n",
+    "                cur_tokens = u_tokens\n",
+    "\n",
+    "        chunks.append(\n",
+    "            Chunk(\n",
+    "                \"\\n\\n\".join(cur_texts),\n",
+    "                {**meta, \"chunk_index\": chunk_index, \"method\": self.name},\n",
+    "            )\n",
+    "        )\n",
+    "        return chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c0fbd36",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run_chunking(\n",
+    "    texts: List[Dict[str, Any]], chunkers: List[Chunker]\n",
+    ") -> Dict[str, List[Chunk]]:\n",
+    "    \"\"\"\n",
+    "    texts: list of {\"text\": ..., \"meta\": {...}}\n",
+    "    \"\"\"\n",
+    "    results = {}\n",
+    "    for ch in chunkers:\n",
+    "        all_chunks = []\n",
+    "        for doc in texts:\n",
+    "            all_chunks.extend(ch.chunk(doc[\"text\"], doc.get(\"meta\", {})))\n",
+    "        results[ch.name] = all_chunks\n",
+    "        print(f\"{ch.name}: {len(all_chunks)} chunks\")\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9c72266",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enc = tiktoken.get_encoding(\"cl100k_base\")\n",
+    "\n",
+    "\n",
+    "def count_tokens(s: str) -> int:\n",
+    "    return len(enc.encode(s))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "90e0a6d7",
+   "metadata": {},
+   "source": [
+    "### Chunk test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a75f0e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# GGUF checkpoints target llama.cpp; sentence-transformers needs the standard HF weights.\n",
+    "embedder = SentenceTransformer(\"Qwen/Qwen3-Embedding-4B\")\n",
+    "\n",
+    "token_chunker = TokenChunker(name=\"token_400_60\", chunk_size=400, overlap=60)\n",
+    "semantic_chunker = SemanticChunker(\n",
+    "    name=\"semantic_400_thr0.78\",\n",
+    "    embedder=embedder,\n",
+    "    max_tokens_fn=count_tokens,\n",
+    "    max_tokens=400,\n",
+    "    similarity_threshold=0.78,\n",
+    ")\n",
+    "\n",
+    "texts = []\n",
+    "for path in [\"doc1.txt\", \"doc2.txt\"]:\n",
+    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
+    "        texts.append({\"text\": f.read(), \"meta\": {\"source\": path}})\n",
+    "\n",
+    "chunked = run_chunking(texts, [token_chunker, semantic_chunker])\n"
+   ]
+  },
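+  {
+   "cell_type": "markdown",
+   "id": "e5f3a1b2",
+   "metadata": {},
+   "source": [
+    "A quick look at the resulting chunk-size distributions per method (a sketch; it only reuses `count_tokens` and the `chunked` dict from the cells above)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6a4b2c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Token counts per chunk, per chunking method.\n",
+    "for method, chunks in chunked.items():\n",
+    "    sizes = [count_tokens(c.text) for c in chunks]\n",
+    "    print(\n",
+    "        f\"{method}: n={len(sizes)}, min={min(sizes)}, \"\n",
+    "        f\"max={max(sizes)}, mean={sum(sizes) / len(sizes):.1f}\"\n",
+    "    )\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}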