added lark to notebook
This commit is contained in:
parent
5a666079a4
commit
8297ae204c
|
|
@ -30,6 +30,7 @@ dev = [
|
||||||
"evidently>=0.7.20",
|
"evidently>=0.7.20",
|
||||||
"jupyter>=1.1.1",
|
"jupyter>=1.1.1",
|
||||||
"langfuse<=3",
|
"langfuse<=3",
|
||||||
|
"lark>=1.3.1",
|
||||||
"markdown>=3.10.2",
|
"markdown>=3.10.2",
|
||||||
"mteb>=2.8.8",
|
"mteb>=2.8.8",
|
||||||
"polars>=1.38.1",
|
"polars>=1.38.1",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,868 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "9ea4c7c5",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# n00 Dual Index Ingestion v1\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook implements a dual-index ingestion strategy for RAG:\n",
|
||||||
|
"- **Docs index** for conceptual documentation chunks\n",
|
||||||
|
"- **Code index** for code chunks with parent/child hierarchy\n",
|
||||||
|
"\n",
|
||||||
|
"It is designed to support query decomposition when there is no exact solution and the model must synthesize ideas from both docs and code."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "4f7602ef",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import re\n",
|
||||||
|
"import uuid\n",
|
||||||
|
"from dataclasses import dataclass\n",
|
||||||
|
"from datetime import datetime, timezone\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"from typing import Any, Dict, Iterable, List, Optional, Tuple\n",
|
||||||
|
"\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import torch.nn.functional as F\n",
|
||||||
|
"from elasticsearch import Elasticsearch\n",
|
||||||
|
"from elasticsearch.helpers import bulk\n",
|
||||||
|
"from loguru import logger\n",
|
||||||
|
"from transformers import AutoConfig, AutoModel, AutoTokenizer\n",
|
||||||
|
"from src.config import PROJ_ROOT, RAW_DIR, OLLAMA_EMB_MODEL_NAME, ELASTICSEARCH_LOCAL_URL\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"DOCS_INDEX = os.getenv(\"ELASTICSEARCH_DOCS_INDEX\", \"avap_docs_v1\")\n",
|
||||||
|
"CODE_INDEX = os.getenv(\"ELASTICSEARCH_CODE_INDEX\", \"avap_code_v1\")\n",
|
||||||
|
"ES_URL = ELASTICSEARCH_LOCAL_URL\n",
|
||||||
|
"RAW_DOCS_DIR = RAW_DIR / \"docs\"\n",
|
||||||
|
"RAW_CODE_DIR = RAW_DIR / \"code\"\n",
|
||||||
|
"\n",
|
||||||
|
"config = AutoConfig.from_pretrained(OLLAMA_EMB_MODEL_NAME)\n",
|
||||||
|
"EMBEDDING_DIM = int(config.hidden_size)\n",
|
||||||
|
"logger.info(f\"Embedding model: {OLLAMA_EMB_MODEL_NAME}\")\n",
|
||||||
|
"logger.info(f\"Embedding dim: {EMBEDDING_DIM}\")\n",
|
||||||
|
"logger.info(f\"Raw docs dir: {RAW_DOCS_DIR}\")\n",
|
||||||
|
"logger.info(f\"Raw code dir: {RAW_CODE_DIR}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7dd24f58",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Domain models"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7aa1acca",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"@dataclass(frozen=True)\n",
|
||||||
|
"class DocChunk:\n",
|
||||||
|
" chunk_id: str\n",
|
||||||
|
" doc_id: str\n",
|
||||||
|
" text: str\n",
|
||||||
|
" source: str\n",
|
||||||
|
" metadata: Dict[str, Any]\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"@dataclass(frozen=True)\n",
|
||||||
|
"class CodeChunk:\n",
|
||||||
|
" chunk_id: str\n",
|
||||||
|
" doc_id: str\n",
|
||||||
|
" text: str\n",
|
||||||
|
" source: str\n",
|
||||||
|
" metadata: Dict[str, Any]\n",
|
||||||
|
" parent_id: Optional[str]\n",
|
||||||
|
" chunk_level: str\n",
|
||||||
|
" language: str\n",
|
||||||
|
" start_line: int\n",
|
||||||
|
" end_line: int\n",
|
||||||
|
" chunk_role: str"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ac97ae4e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Utilities"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a975c8ee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def now_iso() -> str:\n",
|
||||||
|
" return datetime.now(timezone.utc).isoformat()\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def clean_text(text: str) -> str:\n",
|
||||||
|
" text = text.replace(\"\\u00a0\", \" \")\n",
|
||||||
|
" text = re.sub(r\"\\s+\", \" \", text).strip()\n",
|
||||||
|
" return text\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def token_window_chunking(\n",
|
||||||
|
" text: str,\n",
|
||||||
|
" tokenizer: AutoTokenizer,\n",
|
||||||
|
" chunk_size: int = 1000,\n",
|
||||||
|
" overlap: int = 150,\n",
|
||||||
|
") -> List[str]:\n",
|
||||||
|
" if chunk_size <= overlap:\n",
|
||||||
|
" raise ValueError(\"chunk_size must be greater than overlap\")\n",
|
||||||
|
"\n",
|
||||||
|
" token_ids = tokenizer.encode(text, add_special_tokens=False)\n",
|
||||||
|
" chunks: List[str] = []\n",
|
||||||
|
" start = 0\n",
|
||||||
|
"\n",
|
||||||
|
" while start < len(token_ids):\n",
|
||||||
|
" end = min(start + chunk_size, len(token_ids))\n",
|
||||||
|
" piece_ids = token_ids[start:end]\n",
|
||||||
|
" chunks.append(tokenizer.decode(piece_ids, skip_special_tokens=True))\n",
|
||||||
|
"\n",
|
||||||
|
" if end == len(token_ids):\n",
|
||||||
|
" break\n",
|
||||||
|
"\n",
|
||||||
|
" start = end - overlap\n",
|
||||||
|
"\n",
|
||||||
|
" return [chunk for chunk in chunks if chunk.strip()]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "cfe2bfa9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Code chunking with parent/child hierarchy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "33acb0ac",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def _line_windows(\n",
|
||||||
|
" lines: List[str],\n",
|
||||||
|
" max_lines: int,\n",
|
||||||
|
" overlap: int\n",
|
||||||
|
") -> List[Tuple[int, int, str]]:\n",
|
||||||
|
" if max_lines <= overlap:\n",
|
||||||
|
" raise ValueError(\"max_lines must be greater than overlap\")\n",
|
||||||
|
"\n",
|
||||||
|
" windows: List[Tuple[int, int, str]] = []\n",
|
||||||
|
" start = 0\n",
|
||||||
|
"\n",
|
||||||
|
" while start < len(lines):\n",
|
||||||
|
" end = min(start + max_lines, len(lines))\n",
|
||||||
|
" text = \"\\n\".join(lines[start:end]).strip()\n",
|
||||||
|
" if text:\n",
|
||||||
|
" windows.append((start + 1, end, text))\n",
|
||||||
|
"\n",
|
||||||
|
" if end == len(lines):\n",
|
||||||
|
" break\n",
|
||||||
|
"\n",
|
||||||
|
" start = end - overlap\n",
|
||||||
|
"\n",
|
||||||
|
" return windows\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def infer_chunk_role(text: str) -> str:\n",
|
||||||
|
" lowered = text.lower()\n",
|
||||||
|
" if \"try:\" in lowered or \"except\" in lowered:\n",
|
||||||
|
" return \"error\"\n",
|
||||||
|
" if \"import \" in lowered or \"from \" in lowered:\n",
|
||||||
|
" return \"init\"\n",
|
||||||
|
" if \"print(\" in lowered or \"logger\" in lowered:\n",
|
||||||
|
" return \"io\"\n",
|
||||||
|
" return \"logic\"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def build_code_chunks(\n",
|
||||||
|
" code_text: str,\n",
|
||||||
|
" source: str,\n",
|
||||||
|
" metadata: Dict[str, Any],\n",
|
||||||
|
" language: str = \"unknown\",\n",
|
||||||
|
" parent_max_lines: int = 80,\n",
|
||||||
|
" parent_overlap: int = 12,\n",
|
||||||
|
" child_max_lines: int = 20,\n",
|
||||||
|
" child_overlap: int = 4,\n",
|
||||||
|
") -> List[CodeChunk]:\n",
|
||||||
|
" doc_id = str(metadata.get(\"doc_id\") or uuid.uuid4())\n",
|
||||||
|
" lines = code_text.splitlines()\n",
|
||||||
|
"\n",
|
||||||
|
" parent_windows = _line_windows(lines, parent_max_lines, parent_overlap)\n",
|
||||||
|
" chunks: List[CodeChunk] = []\n",
|
||||||
|
"\n",
|
||||||
|
" for parent_idx, (start_line, end_line, parent_text) in enumerate(parent_windows):\n",
|
||||||
|
" parent_id = f\"{doc_id}:p{parent_idx}\"\n",
|
||||||
|
" parent_chunk = CodeChunk(\n",
|
||||||
|
" chunk_id=parent_id,\n",
|
||||||
|
" doc_id=doc_id,\n",
|
||||||
|
" text=parent_text,\n",
|
||||||
|
" source=source,\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" **metadata,\n",
|
||||||
|
" \"doc_id\": doc_id,\n",
|
||||||
|
" \"source_type\": \"code\",\n",
|
||||||
|
" \"updated_at\": now_iso(),\n",
|
||||||
|
" \"is_parent\": True,\n",
|
||||||
|
" },\n",
|
||||||
|
" parent_id=None,\n",
|
||||||
|
" chunk_level=\"parent\",\n",
|
||||||
|
" language=language,\n",
|
||||||
|
" start_line=start_line,\n",
|
||||||
|
" end_line=end_line,\n",
|
||||||
|
" chunk_role=\"logic\",\n",
|
||||||
|
" )\n",
|
||||||
|
" chunks.append(parent_chunk)\n",
|
||||||
|
"\n",
|
||||||
|
" parent_lines = parent_text.splitlines()\n",
|
||||||
|
" child_windows = _line_windows(parent_lines, child_max_lines, child_overlap)\n",
|
||||||
|
"\n",
|
||||||
|
" for child_idx, (child_start, child_end, child_text) in enumerate(child_windows):\n",
|
||||||
|
" global_start = start_line + child_start - 1\n",
|
||||||
|
" global_end = start_line + child_end - 1\n",
|
||||||
|
" child_id = f\"{parent_id}:c{child_idx}\"\n",
|
||||||
|
"\n",
|
||||||
|
" child_chunk = CodeChunk(\n",
|
||||||
|
" chunk_id=child_id,\n",
|
||||||
|
" doc_id=doc_id,\n",
|
||||||
|
" text=child_text,\n",
|
||||||
|
" source=source,\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" **metadata,\n",
|
||||||
|
" \"doc_id\": doc_id,\n",
|
||||||
|
" \"source_type\": \"code\",\n",
|
||||||
|
" \"updated_at\": now_iso(),\n",
|
||||||
|
" \"is_parent\": False,\n",
|
||||||
|
" },\n",
|
||||||
|
" parent_id=parent_id,\n",
|
||||||
|
" chunk_level=\"child\",\n",
|
||||||
|
" language=language,\n",
|
||||||
|
" start_line=global_start,\n",
|
||||||
|
" end_line=global_end,\n",
|
||||||
|
" chunk_role=infer_chunk_role(child_text),\n",
|
||||||
|
" )\n",
|
||||||
|
" chunks.append(child_chunk)\n",
|
||||||
|
"\n",
|
||||||
|
" return chunks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "27583e2b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build documentation chunks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5c452926",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def build_doc_chunks(\n",
|
||||||
|
" doc_text: str,\n",
|
||||||
|
" source: str,\n",
|
||||||
|
" metadata: Dict[str, Any],\n",
|
||||||
|
" tokenizer: AutoTokenizer,\n",
|
||||||
|
" chunk_size: int = 1000,\n",
|
||||||
|
" overlap: int = 150,\n",
|
||||||
|
") -> List[DocChunk]:\n",
|
||||||
|
" doc_id = str(metadata.get(\"doc_id\") or uuid.uuid4())\n",
|
||||||
|
" cleaned = clean_text(doc_text)\n",
|
||||||
|
" parts = token_window_chunking(\n",
|
||||||
|
" cleaned,\n",
|
||||||
|
" tokenizer=tokenizer,\n",
|
||||||
|
" chunk_size=chunk_size,\n",
|
||||||
|
" overlap=overlap,\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" chunks: List[DocChunk] = []\n",
|
||||||
|
" for idx, part in enumerate(parts):\n",
|
||||||
|
" chunk_id = f\"{doc_id}:d{idx}\"\n",
|
||||||
|
" chunks.append(\n",
|
||||||
|
" DocChunk(\n",
|
||||||
|
" chunk_id=chunk_id,\n",
|
||||||
|
" doc_id=doc_id,\n",
|
||||||
|
" text=part,\n",
|
||||||
|
" source=source,\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" **metadata,\n",
|
||||||
|
" \"doc_id\": doc_id,\n",
|
||||||
|
" \"source_type\": \"doc\",\n",
|
||||||
|
" \"updated_at\": now_iso(),\n",
|
||||||
|
" },\n",
|
||||||
|
" )\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" return chunks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0a4ee80e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Embedding utilities"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "be0e5adb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"_EMBED_CACHE: Dict[str, Any] = {}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def get_embedder(model_name: str = OLLAMA_EMB_MODEL_NAME) -> Tuple[AutoTokenizer, AutoModel, str]:\n",
|
||||||
|
" if model_name in _EMBED_CACHE:\n",
|
||||||
|
" return _EMBED_CACHE[model_name]\n",
|
||||||
|
"\n",
|
||||||
|
" device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||||
|
" tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)\n",
|
||||||
|
" model = AutoModel.from_pretrained(model_name).to(device)\n",
|
||||||
|
" model.eval()\n",
|
||||||
|
"\n",
|
||||||
|
" _EMBED_CACHE[model_name] = (tokenizer, model, device)\n",
|
||||||
|
" return tokenizer, model, device\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def embed_texts(texts: List[str], batch_size: int = 32) -> List[List[float]]:\n",
|
||||||
|
" tokenizer, model, device = get_embedder()\n",
|
||||||
|
" vectors: List[List[float]] = []\n",
|
||||||
|
"\n",
|
||||||
|
" for idx in range(0, len(texts), batch_size):\n",
|
||||||
|
" batch = texts[idx: idx + batch_size]\n",
|
||||||
|
" with torch.no_grad():\n",
|
||||||
|
" encoded = tokenizer(\n",
|
||||||
|
" batch,\n",
|
||||||
|
" padding=True,\n",
|
||||||
|
" truncation=True,\n",
|
||||||
|
" return_tensors=\"pt\",\n",
|
||||||
|
" ).to(device)\n",
|
||||||
|
" output = model(**encoded)\n",
|
||||||
|
" mask = encoded[\"attention_mask\"].unsqueeze(-1)\n",
|
||||||
|
" pooled = (output.last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)\n",
|
||||||
|
" normalized = F.normalize(pooled, p=2, dim=1)\n",
|
||||||
|
" vectors.extend(normalized.cpu().tolist())\n",
|
||||||
|
"\n",
|
||||||
|
" return vectors"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "674b0561",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Elasticsearch mappings (separate indices)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "11040f31",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"DOCS_MAPPING = {\n",
|
||||||
|
" \"settings\": {\"index\": {\"number_of_shards\": 1}},\n",
|
||||||
|
" \"mappings\": {\n",
|
||||||
|
" \"properties\": {\n",
|
||||||
|
" \"doc_id\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"chunk_id\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"text\": {\"type\": \"text\"},\n",
|
||||||
|
" \"source\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"source_type\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"domain\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"version\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"tags\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"difficulty\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"updated_at\": {\"type\": \"date\"},\n",
|
||||||
|
" \"section_title\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"topic_path\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"metadata\": {\"type\": \"object\", \"enabled\": True},\n",
|
||||||
|
" \"embedding\": {\n",
|
||||||
|
" \"type\": \"dense_vector\",\n",
|
||||||
|
" \"dims\": EMBEDDING_DIM,\n",
|
||||||
|
" \"index\": True,\n",
|
||||||
|
" \"similarity\": \"cosine\",\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
" },\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"CODE_MAPPING = {\n",
|
||||||
|
" \"settings\": {\"index\": {\"number_of_shards\": 1}},\n",
|
||||||
|
" \"mappings\": {\n",
|
||||||
|
" \"properties\": {\n",
|
||||||
|
" \"doc_id\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"chunk_id\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"example_id\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"example_title\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"parent_id\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"chunk_level\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"chunk_role\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"language\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"symbol_name\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"intent\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"dependencies\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"start_line\": {\"type\": \"integer\"},\n",
|
||||||
|
" \"end_line\": {\"type\": \"integer\"},\n",
|
||||||
|
" \"repo\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"file_path\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"source\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"source_type\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"domain\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"version\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"tags\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"difficulty\": {\"type\": \"keyword\"},\n",
|
||||||
|
" \"updated_at\": {\"type\": \"date\"},\n",
|
||||||
|
" \"text\": {\"type\": \"text\"},\n",
|
||||||
|
" \"metadata\": {\"type\": \"object\", \"enabled\": True},\n",
|
||||||
|
" \"embedding\": {\n",
|
||||||
|
" \"type\": \"dense_vector\",\n",
|
||||||
|
" \"dims\": EMBEDDING_DIM,\n",
|
||||||
|
" \"index\": True,\n",
|
||||||
|
" \"similarity\": \"cosine\",\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
" },\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "8f78db43",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def get_es_client() -> Elasticsearch:\n",
|
||||||
|
" return Elasticsearch(\n",
|
||||||
|
" ES_URL,\n",
|
||||||
|
" request_timeout=90,\n",
|
||||||
|
" max_retries=5,\n",
|
||||||
|
" retry_on_timeout=True,\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def recreate_index(es: Elasticsearch, index_name: str, mapping: Dict[str, Any]) -> None:\n",
|
||||||
|
" if es.indices.exists(index=index_name):\n",
|
||||||
|
" es.indices.delete(index=index_name)\n",
|
||||||
|
" logger.info(f\"Deleted existing index: {index_name}\")\n",
|
||||||
|
"\n",
|
||||||
|
" es.indices.create(index=index_name, body=mapping)\n",
|
||||||
|
" logger.info(f\"Created index: {index_name}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8a66d191",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Indexing functions"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5db6e05e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def index_doc_chunks(\n",
|
||||||
|
" es: Elasticsearch,\n",
|
||||||
|
" index_name: str,\n",
|
||||||
|
" chunks: List[DocChunk],\n",
|
||||||
|
" batch_size: int = 32,\n",
|
||||||
|
") -> None:\n",
|
||||||
|
" vectors = embed_texts([chunk.text for chunk in chunks], batch_size=batch_size)\n",
|
||||||
|
"\n",
|
||||||
|
" def actions() -> Iterable[Dict[str, Any]]:\n",
|
||||||
|
" for chunk, vector in zip(chunks, vectors):\n",
|
||||||
|
" yield {\n",
|
||||||
|
" \"_op_type\": \"index\",\n",
|
||||||
|
" \"_index\": index_name,\n",
|
||||||
|
" \"_id\": chunk.chunk_id,\n",
|
||||||
|
" \"_source\": {\n",
|
||||||
|
" \"doc_id\": chunk.doc_id,\n",
|
||||||
|
" \"chunk_id\": chunk.chunk_id,\n",
|
||||||
|
" \"text\": chunk.text,\n",
|
||||||
|
" \"source\": chunk.source,\n",
|
||||||
|
" \"source_type\": \"doc\",\n",
|
||||||
|
" \"domain\": chunk.metadata.get(\"domain\"),\n",
|
||||||
|
" \"version\": chunk.metadata.get(\"version\"),\n",
|
||||||
|
" \"tags\": chunk.metadata.get(\"tags\", []),\n",
|
||||||
|
" \"difficulty\": chunk.metadata.get(\"difficulty\"),\n",
|
||||||
|
" \"updated_at\": chunk.metadata.get(\"updated_at\"),\n",
|
||||||
|
" \"section_title\": chunk.metadata.get(\"section_title\"),\n",
|
||||||
|
" \"topic_path\": chunk.metadata.get(\"topic_path\"),\n",
|
||||||
|
" \"metadata\": chunk.metadata,\n",
|
||||||
|
" \"embedding\": vector,\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" bulk(es.options(request_timeout=180), actions())\n",
|
||||||
|
" logger.info(f\"Indexed {len(chunks)} doc chunks into {index_name}\")\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def index_code_chunks(\n",
|
||||||
|
" es: Elasticsearch,\n",
|
||||||
|
" index_name: str,\n",
|
||||||
|
" chunks: List[CodeChunk],\n",
|
||||||
|
" batch_size: int = 32,\n",
|
||||||
|
") -> None:\n",
|
||||||
|
" vectors = embed_texts([chunk.text for chunk in chunks], batch_size=batch_size)\n",
|
||||||
|
"\n",
|
||||||
|
" def actions() -> Iterable[Dict[str, Any]]:\n",
|
||||||
|
" for chunk, vector in zip(chunks, vectors):\n",
|
||||||
|
" yield {\n",
|
||||||
|
" \"_op_type\": \"index\",\n",
|
||||||
|
" \"_index\": index_name,\n",
|
||||||
|
" \"_id\": chunk.chunk_id,\n",
|
||||||
|
" \"_source\": {\n",
|
||||||
|
" \"doc_id\": chunk.doc_id,\n",
|
||||||
|
" \"chunk_id\": chunk.chunk_id,\n",
|
||||||
|
" \"example_id\": chunk.metadata.get(\"example_id\"),\n",
|
||||||
|
" \"example_title\": chunk.metadata.get(\"example_title\"),\n",
|
||||||
|
" \"parent_id\": chunk.parent_id,\n",
|
||||||
|
" \"chunk_level\": chunk.chunk_level,\n",
|
||||||
|
" \"chunk_role\": chunk.chunk_role,\n",
|
||||||
|
" \"language\": chunk.language,\n",
|
||||||
|
" \"symbol_name\": chunk.metadata.get(\"symbol_name\"),\n",
|
||||||
|
" \"intent\": chunk.metadata.get(\"intent\"),\n",
|
||||||
|
" \"dependencies\": chunk.metadata.get(\"dependencies\", []),\n",
|
||||||
|
" \"start_line\": chunk.start_line,\n",
|
||||||
|
" \"end_line\": chunk.end_line,\n",
|
||||||
|
" \"repo\": chunk.metadata.get(\"repo\"),\n",
|
||||||
|
" \"file_path\": chunk.metadata.get(\"file_path\"),\n",
|
||||||
|
" \"source\": chunk.source,\n",
|
||||||
|
" \"source_type\": \"code\",\n",
|
||||||
|
" \"domain\": chunk.metadata.get(\"domain\"),\n",
|
||||||
|
" \"version\": chunk.metadata.get(\"version\"),\n",
|
||||||
|
" \"tags\": chunk.metadata.get(\"tags\", []),\n",
|
||||||
|
" \"difficulty\": chunk.metadata.get(\"difficulty\"),\n",
|
||||||
|
" \"updated_at\": chunk.metadata.get(\"updated_at\"),\n",
|
||||||
|
" \"text\": chunk.text,\n",
|
||||||
|
" \"metadata\": chunk.metadata,\n",
|
||||||
|
" \"embedding\": vector,\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" bulk(es.options(request_timeout=180), actions())\n",
|
||||||
|
" logger.info(f\"Indexed {len(chunks)} code chunks into {index_name}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "db3a45d4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Ingest from data/raw/docs and data/raw/code"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f0e3b3e4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def _natural_sort_key(path: Path) -> Tuple[int, str]:\n",
|
||||||
|
" match = re.match(r\"^(\\d+)_\", path.stem)\n",
|
||||||
|
" if match:\n",
|
||||||
|
" return int(match.group(1)), path.stem.lower()\n",
|
||||||
|
" return 10**9, path.stem.lower()\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def discover_raw_sources(\n",
|
||||||
|
" docs_dir: Path,\n",
|
||||||
|
" code_dir: Path,\n",
|
||||||
|
") -> Tuple[List[Path], List[Path]]:\n",
|
||||||
|
" if not docs_dir.exists():\n",
|
||||||
|
" raise FileNotFoundError(f\"Docs directory not found: {docs_dir}\")\n",
|
||||||
|
" if not code_dir.exists():\n",
|
||||||
|
" raise FileNotFoundError(f\"Code directory not found: {code_dir}\")\n",
|
||||||
|
"\n",
|
||||||
|
" docs_files = sorted(\n",
|
||||||
|
" [path for path in docs_dir.glob(\"*.txt\") if path.is_file()],\n",
|
||||||
|
" key=_natural_sort_key,\n",
|
||||||
|
" )\n",
|
||||||
|
" code_files = sorted(\n",
|
||||||
|
" [path for path in code_dir.glob(\"*.txt\") if path.is_file()],\n",
|
||||||
|
" key=_natural_sort_key,\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" if not docs_files:\n",
|
||||||
|
" raise ValueError(f\"No .txt documentation files found in {docs_dir}\")\n",
|
||||||
|
" if not code_files:\n",
|
||||||
|
" raise ValueError(f\"No .txt code files found in {code_dir}\")\n",
|
||||||
|
"\n",
|
||||||
|
" return docs_files, code_files\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def infer_example_title(prefix_text: str, fallback_title: str) -> str:\n",
|
||||||
|
" candidate_lines = [line.strip() for line in prefix_text.splitlines() if line.strip()]\n",
|
||||||
|
" for line in reversed(candidate_lines[-8:]):\n",
|
||||||
|
" lowered = line.lower()\n",
|
||||||
|
" if lowered == \"code snippet\":\n",
|
||||||
|
" continue\n",
|
||||||
|
" cleaned = re.sub(r\"^[#\\-\\s>*]*\", \"\", line)\n",
|
||||||
|
" cleaned = re.sub(r\"^\\d+[\\.)]\\s*\", \"\", cleaned)\n",
|
||||||
|
" cleaned = cleaned.replace(\"**\", \"\").strip()\n",
|
||||||
|
" if cleaned and cleaned.lower() != \"code snippet\":\n",
|
||||||
|
" return cleaned[:120]\n",
|
||||||
|
" return fallback_title\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def extract_code_examples(file_text: str, file_stem: str) -> List[Dict[str, str]]:\n",
|
||||||
|
" pattern = re.compile(\n",
|
||||||
|
" r\"```(?P<lang>[a-zA-Z0-9_+-]*)\\n(?P<code>.*?)```\",\n",
|
||||||
|
" re.DOTALL,\n",
|
||||||
|
" )\n",
|
||||||
|
" examples: List[Dict[str, str]] = []\n",
|
||||||
|
"\n",
|
||||||
|
" for idx, match in enumerate(pattern.finditer(file_text), start=1):\n",
|
||||||
|
" code = match.group(\"code\").strip()\n",
|
||||||
|
" if not code:\n",
|
||||||
|
" continue\n",
|
||||||
|
" fallback_title = f\"{file_stem} - Example {idx:03d}\"\n",
|
||||||
|
" prefix = file_text[max(0, match.start() - 600): match.start()]\n",
|
||||||
|
" title = infer_example_title(prefix, fallback_title)\n",
|
||||||
|
" language = match.group(\"lang\").strip() or \"avap\"\n",
|
||||||
|
" examples.append({\n",
|
||||||
|
" \"example_index\": idx,\n",
|
||||||
|
" \"title\": title,\n",
|
||||||
|
" \"language\": language,\n",
|
||||||
|
" \"code\": code,\n",
|
||||||
|
" })\n",
|
||||||
|
"\n",
|
||||||
|
" if examples:\n",
|
||||||
|
" return examples\n",
|
||||||
|
"\n",
|
||||||
|
" plain_text = file_text.strip()\n",
|
||||||
|
" if not plain_text:\n",
|
||||||
|
" return []\n",
|
||||||
|
"\n",
|
||||||
|
" return [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"example_index\": 1,\n",
|
||||||
|
" \"title\": f\"{file_stem} - Example 001\",\n",
|
||||||
|
" \"language\": \"avap\",\n",
|
||||||
|
" \"code\": plain_text,\n",
|
||||||
|
" }\n",
|
||||||
|
" ]\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def build_doc_chunks_from_raw(\n",
|
||||||
|
" docs_files: List[Path],\n",
|
||||||
|
" tokenizer: AutoTokenizer,\n",
|
||||||
|
" chunk_size: int = 900,\n",
|
||||||
|
" overlap: int = 120,\n",
|
||||||
|
") -> List[DocChunk]:\n",
|
||||||
|
" all_chunks: List[DocChunk] = []\n",
|
||||||
|
"\n",
|
||||||
|
" for idx, path in enumerate(docs_files, start=1):\n",
|
||||||
|
" title = path.stem.replace(\"_\", \" \")\n",
|
||||||
|
" text = path.read_text(encoding=\"utf-8\")\n",
|
||||||
|
"\n",
|
||||||
|
" chunks = build_doc_chunks(\n",
|
||||||
|
" doc_text=text,\n",
|
||||||
|
" source=str(path.relative_to(PROJ_ROOT)),\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" \"doc_id\": f\"doc-{idx:03d}\",\n",
|
||||||
|
" \"domain\": \"avap\",\n",
|
||||||
|
" \"version\": \"v1\",\n",
|
||||||
|
" \"tags\": [\"documentation\"],\n",
|
||||||
|
" \"difficulty\": \"mixed\",\n",
|
||||||
|
" \"section_title\": title,\n",
|
||||||
|
" \"topic_path\": f\"docs/{path.stem}\",\n",
|
||||||
|
" },\n",
|
||||||
|
" tokenizer=tokenizer,\n",
|
||||||
|
" chunk_size=chunk_size,\n",
|
||||||
|
" overlap=overlap,\n",
|
||||||
|
" )\n",
|
||||||
|
" all_chunks.extend(chunks)\n",
|
||||||
|
"\n",
|
||||||
|
" return all_chunks\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def build_code_chunks_from_raw(\n",
|
||||||
|
" code_files: List[Path],\n",
|
||||||
|
" child_max_lines: int = 14,\n",
|
||||||
|
" child_overlap: int = 3,\n",
|
||||||
|
") -> List[CodeChunk]:\n",
|
||||||
|
" all_chunks: List[CodeChunk] = []\n",
|
||||||
|
" example_counter = 0\n",
|
||||||
|
"\n",
|
||||||
|
" for code_file in code_files:\n",
|
||||||
|
" file_text = code_file.read_text(encoding=\"utf-8\")\n",
|
||||||
|
" examples = extract_code_examples(file_text, code_file.stem)\n",
|
||||||
|
"\n",
|
||||||
|
" for example in examples:\n",
|
||||||
|
" example_counter += 1\n",
|
||||||
|
" code_text = example[\"code\"]\n",
|
||||||
|
" line_count = max(1, len(code_text.splitlines()))\n",
|
||||||
|
" child_window = min(child_max_lines, line_count)\n",
|
||||||
|
" overlap_window = min(child_overlap, max(0, child_window - 1))\n",
|
||||||
|
"\n",
|
||||||
|
" code_chunks = build_code_chunks(\n",
|
||||||
|
" code_text=code_text,\n",
|
||||||
|
" source=str(code_file.relative_to(PROJ_ROOT)),\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" \"doc_id\": f\"code-{example_counter:03d}\",\n",
|
||||||
|
" \"example_id\": f\"example-{example_counter:03d}\",\n",
|
||||||
|
" \"example_title\": example[\"title\"],\n",
|
||||||
|
" \"repo\": \"BRUNIX-AI/assistance-engine\",\n",
|
||||||
|
" \"file_path\": str(code_file.relative_to(PROJ_ROOT)),\n",
|
||||||
|
" \"symbol_name\": example[\"title\"],\n",
|
||||||
|
" \"intent\": \"avap code example\",\n",
|
||||||
|
" \"dependencies\": [],\n",
|
||||||
|
" \"domain\": \"avap\",\n",
|
||||||
|
" \"version\": \"v1\",\n",
|
||||||
|
" \"tags\": [\"code-example\"],\n",
|
||||||
|
" \"difficulty\": \"mixed\",\n",
|
||||||
|
" },\n",
|
||||||
|
" language=example[\"language\"],\n",
|
||||||
|
" parent_max_lines=line_count,\n",
|
||||||
|
" parent_overlap=0,\n",
|
||||||
|
" child_max_lines=child_window,\n",
|
||||||
|
" child_overlap=overlap_window,\n",
|
||||||
|
" )\n",
|
||||||
|
" all_chunks.extend(code_chunks)\n",
|
||||||
|
"\n",
|
||||||
|
" return all_chunks\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"tokenizer, _, _ = get_embedder()\n",
|
||||||
|
"docs_files, code_files = discover_raw_sources(RAW_DOCS_DIR, RAW_CODE_DIR)\n",
|
||||||
|
"\n",
|
||||||
|
"doc_chunks = build_doc_chunks_from_raw(docs_files, tokenizer=tokenizer)\n",
|
||||||
|
"code_chunks = build_code_chunks_from_raw(code_files)\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Documentation files discovered: {len(docs_files)}\")\n",
|
||||||
|
"print(f\"Code files discovered: {len(code_files)}\")\n",
|
||||||
|
"print(f\"Doc chunks built: {len(doc_chunks)}\")\n",
|
||||||
|
"print(f\"Code chunks built (parent + child): {len(code_chunks)}\")\n",
|
||||||
|
"print(f\"Parent chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'parent')}\")\n",
|
||||||
|
"print(f\"Child chunks: {sum(1 for chunk in code_chunks if chunk.chunk_level == 'child')}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "573ead0e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"es = get_es_client()\n",
|
||||||
|
"recreate_index(es, DOCS_INDEX, DOCS_MAPPING)\n",
|
||||||
|
"recreate_index(es, CODE_INDEX, CODE_MAPPING)\n",
|
||||||
|
"\n",
|
||||||
|
"index_doc_chunks(es, DOCS_INDEX, doc_chunks, batch_size=16)\n",
|
||||||
|
"index_code_chunks(es, CODE_INDEX, code_chunks, batch_size=16)\n",
|
||||||
|
"\n",
|
||||||
|
"es.indices.refresh(index=DOCS_INDEX)\n",
|
||||||
|
"es.indices.refresh(index=CODE_INDEX)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Dual index ingestion completed from data/raw.\")\n",
|
||||||
|
"print(f\"Docs index: {DOCS_INDEX}\")\n",
|
||||||
|
"print(f\"Code index: {CODE_INDEX}\")\n",
|
||||||
|
"print(f\"Indexed doc chunks: {len(doc_chunks)}\")\n",
|
||||||
|
"print(f\"Indexed code chunks: {len(code_chunks)}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "35f6cdee",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Optional: vector retrieval across both indices"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "968a8727",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def vector_search(\n",
|
||||||
|
" es: Elasticsearch,\n",
|
||||||
|
" index_name: str,\n",
|
||||||
|
" query: str,\n",
|
||||||
|
" top_k: int = 5,\n",
|
||||||
|
") -> List[Dict[str, Any]]:\n",
|
||||||
|
" query_vec = embed_texts([query], batch_size=1)[0]\n",
|
||||||
|
"\n",
|
||||||
|
" body = {\n",
|
||||||
|
" \"size\": top_k,\n",
|
||||||
|
" \"query\": {\n",
|
||||||
|
" \"script_score\": {\n",
|
||||||
|
" \"query\": {\"match_all\": {}},\n",
|
||||||
|
" \"script\": {\n",
|
||||||
|
" \"source\": \"cosineSimilarity(params.query_vector, 'embedding') + 1.0\",\n",
|
||||||
|
" \"params\": {\"query_vector\": query_vec},\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
" },\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" result = es.search(index=index_name, body=body)\n",
|
||||||
|
" return result[\"hits\"][\"hits\"]\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"query = \"How do I handle normalization and edge cases in vector similarity?\"\n",
|
||||||
|
"doc_hits = vector_search(es, DOCS_INDEX, query, top_k=3)\n",
|
||||||
|
"code_hits = vector_search(es, CODE_INDEX, query, top_k=5)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Top docs:\")\n",
|
||||||
|
"for hit in doc_hits:\n",
|
||||||
|
" print(\"-\", hit[\"_source\"].get(\"section_title\"), \"|\", hit[\"_score\"])\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"\\nTop code chunks:\")\n",
|
||||||
|
"for hit in code_hits:\n",
|
||||||
|
" source = hit[\"_source\"]\n",
|
||||||
|
" print(\n",
|
||||||
|
" \"-\",\n",
|
||||||
|
" source.get(\"chunk_id\"),\n",
|
||||||
|
" \"| level=\",\n",
|
||||||
|
" source.get(\"chunk_level\"),\n",
|
||||||
|
" \"| parent=\",\n",
|
||||||
|
" source.get(\"parent_id\"),\n",
|
||||||
|
" \"| score=\",\n",
|
||||||
|
" hit.get(\"_score\"),\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
File diff suppressed because one or more lines are too long
4
uv.lock
4
uv.lock
|
|
@ -1,5 +1,5 @@
|
||||||
version = 1
|
version = 1
|
||||||
revision = 3
|
revision = 2
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
resolution-markers = [
|
resolution-markers = [
|
||||||
"python_full_version >= '3.14' and sys_platform == 'win32'",
|
"python_full_version >= '3.14' and sys_platform == 'win32'",
|
||||||
|
|
@ -293,6 +293,7 @@ dev = [
|
||||||
{ name = "evidently" },
|
{ name = "evidently" },
|
||||||
{ name = "jupyter" },
|
{ name = "jupyter" },
|
||||||
{ name = "langfuse" },
|
{ name = "langfuse" },
|
||||||
|
{ name = "lark" },
|
||||||
{ name = "markdown" },
|
{ name = "markdown" },
|
||||||
{ name = "mteb" },
|
{ name = "mteb" },
|
||||||
{ name = "polars" },
|
{ name = "polars" },
|
||||||
|
|
@ -330,6 +331,7 @@ dev = [
|
||||||
{ name = "evidently", specifier = ">=0.7.20" },
|
{ name = "evidently", specifier = ">=0.7.20" },
|
||||||
{ name = "jupyter", specifier = ">=1.1.1" },
|
{ name = "jupyter", specifier = ">=1.1.1" },
|
||||||
{ name = "langfuse", specifier = "<=3" },
|
{ name = "langfuse", specifier = "<=3" },
|
||||||
|
{ name = "lark", specifier = ">=1.3.1" },
|
||||||
{ name = "markdown", specifier = ">=3.10.2" },
|
{ name = "markdown", specifier = ">=3.10.2" },
|
||||||
{ name = "mteb", specifier = ">=2.8.8" },
|
{ name = "mteb", specifier = ">=2.8.8" },
|
||||||
{ name = "polars", specifier = ">=1.38.1" },
|
{ name = "polars", specifier = ">=1.38.1" },
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue