Merge branch 'mrh-online-dev' of github.com:BRUNIX-AI/assistance-engine into mrh-online-dev
This commit is contained in:
commit
ba4a1f1efc
|
|
@ -35,7 +35,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
||||||
index_name=os.getenv("ELASTICSEARCH_INDEX"),
|
index_name=os.getenv("ELASTICSEARCH_INDEX"),
|
||||||
embedding=self.embeddings,
|
embedding=self.embeddings,
|
||||||
query_field="text",
|
query_field="text",
|
||||||
vector_query_field="vector",
|
vector_query_field="embedding",
|
||||||
)
|
)
|
||||||
self.graph = build_graph(
|
self.graph = build_graph(
|
||||||
llm=self.llm,
|
llm=self.llm,
|
||||||
|
|
|
||||||
45
README.md
45
README.md
|
|
@ -45,13 +45,15 @@ graph TD
|
||||||
|
|
||||||
├── README.md # System documentation & Dev guide
|
├── README.md # System documentation & Dev guide
|
||||||
├── changelog # Version tracking and release history
|
├── changelog # Version tracking and release history
|
||||||
├── pyproject.toml # Python project configuration
|
├── pyproject.toml
|
||||||
|
├── ingestion/ # Data ingested in Elasticsearch
|
||||||
├── docs/
|
├── docs/
|
||||||
| ├── AVAP Language: ... # AVAP DSL Documentation
|
│ ├── AVAP Language: ... # AVAP DSL Documentation
|
||||||
| | └── AVAP.md
|
│ │ └── AVAP.md
|
||||||
│ ├── developer.avapfr... # Documents on developer web page
|
│ ├── developer.avapfr... # Documents on developer web page
|
||||||
| └── LRM/ # AVAP LRM documentation
|
│ ├── LRM/ # AVAP LRM documentation
|
||||||
| └── avap.md
|
│ │ └── avap.md
|
||||||
|
│ └── samples/ # AVAP code samples
|
||||||
├── Docker/
|
├── Docker/
|
||||||
│ ├── protos/
|
│ ├── protos/
|
||||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||||
|
|
@ -64,30 +66,16 @@ graph TD
|
||||||
│ ├── Dockerfile # Container definition for the Engine
|
│ ├── Dockerfile # Container definition for the Engine
|
||||||
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
||||||
│ ├── requirements.txt # Python dependencies for Docker
|
│ ├── requirements.txt # Python dependencies for Docker
|
||||||
│ ├── protos/
|
│ └── .dockerignore # Docker ignore files
|
||||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
|
||||||
│ └── src/
|
|
||||||
│ ├── graph.py # Workflow graph orchestration
|
|
||||||
│ ├── prompts.py # Centralized prompt definitions
|
|
||||||
│ ├── server.py # gRPC Server & RAG Orchestration
|
|
||||||
│ ├── state.py # Shared state management
|
|
||||||
│ └── utils/ # Utility modules
|
|
||||||
├── ingestion/
|
|
||||||
│ └── docs/ # AVAP documentation chunks
|
|
||||||
├── kubernetes/
|
|
||||||
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
|
||||||
├── scripts/
|
├── scripts/
|
||||||
│ └── pipelines/
|
│ └── pipelines/
|
||||||
| ├── samples_generator/ # AVAP Sample generator
|
│ ├── flows/ # Processing pipelines
|
||||||
| | └─ generate_mbap.py
|
│ └── tasks/ # Modules used by the flows
|
||||||
│ └── flows/ # Data processing flows
|
|
||||||
| └─ elasticsearch_ingestion.py
|
|
||||||
└── src/
|
└── src/
|
||||||
├── __init__.py
|
├── config.py # Environment variables configuration file
|
||||||
└── utils/
|
└── utils/
|
||||||
├── emb_factory.py # Embedding model factory
|
├── emb_factory.py # Embedding model factory
|
||||||
├── llm_factory.py # LLM model factory
|
└── llm_factory.py # LLM model factory
|
||||||
└── __init__.py
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
@ -146,6 +134,7 @@ The engine utilizes Langfuse for end-to-end tracing and performance monitoring.
|
||||||
Create a `.env` file in the project root with the following variables:
|
Create a `.env` file in the project root with the following variables:
|
||||||
|
|
||||||
```env
|
```env
|
||||||
|
PYTHONPATH=${PYTHONPATH}:/home/...
|
||||||
ELASTICSEARCH_URL=http://host.docker.internal:9200
|
ELASTICSEARCH_URL=http://host.docker.internal:9200
|
||||||
ELASTICSEARCH_LOCAL_URL=http://localhost:9200
|
ELASTICSEARCH_LOCAL_URL=http://localhost:9200
|
||||||
ELASTICSEARCH_INDEX=avap-docs-test
|
ELASTICSEARCH_INDEX=avap-docs-test
|
||||||
|
|
@ -157,11 +146,13 @@ OLLAMA_URL=http://host.docker.internal:11434
|
||||||
OLLAMA_LOCAL_URL=http://localhost:11434
|
OLLAMA_LOCAL_URL=http://localhost:11434
|
||||||
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
||||||
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
||||||
|
HF_TOKEN=hf_...
|
||||||
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
||||||
```
|
```
|
||||||
|
|
||||||
| Variable | Required | Description | Example |
|
| Variable | Required | Description | Example |
|
||||||
|---|---|---|---|
|
|---|---|---|---|
|
||||||
|
| `PYTHONPATH` | No | Path that points to the root of the project | `${PYTHONPATH}:/home/...` |
|
||||||
| `ELASTICSEARCH_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in Docker | `http://host.docker.internal:9200` |
|
| `ELASTICSEARCH_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in Docker | `http://host.docker.internal:9200` |
|
||||||
| `ELASTICSEARCH_LOCAL_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in local | `http://localhost:9200` |
|
| `ELASTICSEARCH_LOCAL_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in local | `http://localhost:9200` |
|
||||||
| `ELASTICSEARCH_INDEX` | Yes | Elasticsearch index name used by the engine | `avap-docs-test` |
|
| `ELASTICSEARCH_INDEX` | Yes | Elasticsearch index name used by the engine | `avap-docs-test` |
|
||||||
|
|
@ -183,13 +174,13 @@ Open a terminal and establish the connection to the Devaron Cluster:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. AI Model Tunnel (Ollama)
|
# 1. AI Model Tunnel (Ollama)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
|
|
||||||
# 2. Knowledge Base Tunnel (Elasticsearch)
|
# 2. Knowledge Base Tunnel (Elasticsearch)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
|
|
||||||
# 3. Observability DB Tunnel (PostgreSQL)
|
# 3. Observability DB Tunnel (PostgreSQL)
|
||||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||||
```
|
```
|
||||||
|
|
||||||
### 5. Launch the Engine
|
### 5. Launch the Engine
|
||||||
|
|
|
||||||
|
|
@ -4,24 +4,23 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## [1.5.0] - 2026-03-11
|
## [1.5.0] - 2026-03-12
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- IMPLEMENTED:
|
- IMPLEMENTED:
|
||||||
- `scripts/pipelines/flows/translate_mbpp.py`: pipeline to generate a synthetic dataset from the MBPP dataset.
|
- `scripts/pipelines/flows/translate_mbpp.py`: pipeline to generate a synthetic dataset from the MBPP dataset.
|
||||||
- `scripts/input/prompts.py`: module containing prompts for pipelines.
|
- `scripts/tasks/prompts.py`: module containing prompts for pipelines.
|
||||||
- `scripts/tasks/chunk.py`: module containing functions related to chunk management.
|
- `scripts/tasks/chunk.py`: module containing functions related to chunk management.
|
||||||
- `synthethic_datasets`: folder containing generated synthetic datasets.
|
- `synthethic_datasets`: folder containing generated synthetic datasets.
|
||||||
- `src/config.py`: environment variables configuration file.
|
- `src/config.py`: environment variables configuration file.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre chunked files.
|
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs/LRM` or `docs/samples` documents instead of pre chunked files.
|
||||||
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
||||||
- REMOVED: `Makefile` file.
|
- REMOVED: `Makefile` file.
|
||||||
- REMOVED: `scripts/start-tunnels.sh` script.
|
- REMOVED: `scripts/start-tunnels.sh` script.
|
||||||
- REMOVED `ingestion` folder.
|
|
||||||
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
||||||
- MOVED `scripts/generate_mbpp_avap.py` into `scripts/flows/generate_mbpp_avap.py`
|
- MOVED `scripts/generate_mbap.py` into `scripts/flows/generate_mbap.py`.
|
||||||
|
|
||||||
|
|
||||||
## [1.4.0] - 2026-03-10
|
## [1.4.0] - 2026-03-10
|
||||||
|
|
|
||||||
2305
docs/AVAP_dev.md
2305
docs/AVAP_dev.md
File diff suppressed because it is too large
Load Diff
173
docs/LRM/avap.md
173
docs/LRM/avap.md
|
|
@ -115,7 +115,41 @@ AVAP utiliza una gramática estructural mixta. Combina la fluidez de las palabra
|
||||||
La estructura `if()` evalúa una expresión lógica o de comparación. Todo bloque condicional requiere un cierre explícito utilizando el comando `end()`.
|
La estructura `if()` evalúa una expresión lógica o de comparación. Todo bloque condicional requiere un cierre explícito utilizando el comando `end()`.
|
||||||
|
|
||||||
El comando `if()` soporta dos modos de invocación:
|
El comando `if()` soporta dos modos de invocación:
|
||||||
* **Modo 1 (comparación estructurada):** `if(variable, valor, comparador)` — evalúa la comparación entre variable y valor usando el operador indicado como string (ej. `"=="`, `">"`, `"!="`). Los dos primeros argumentos deben ser identificadores simples o literales, nunca expresiones de acceso como `dict['clave']`. Si se necesita comparar un valor extraído de una estructura, debe asignarse primero a una variable.* **Modo 2 (expresión libre):** `if(None, None, "expresion_compleja")` — evalúa directamente una expresión booleana compleja proporcionada como string.
|
* **Modo 1 (comparación estructurada):** `if(variable, valor, comparador)` — evalúa la comparación entre variable y valor usando el operador indicado como string (ej. `"=="`, `">"`, `"!="`). Los dos primeros argumentos deben ser identificadores simples o literales, nunca expresiones de acceso como `dict['clave']`. Si se necesita comparar un valor extraído de una estructura, debe asignarse primero a una variable.
* **Modo 2 (expresión libre):** `if(None, None, expresion_compleja)` — evalúa directamente una expresión booleana compleja proporcionada como string encapsulado entre acentos graves (`` ` ``).
|
||||||
|
|
||||||
|
## SECCIÓN III: Lógica de Control y Estructuras de Decisión
|
||||||
|
|
||||||
|
AVAP utiliza una gramática estructural mixta. Combina la fluidez de las palabras clave para abrir bloques funcionales con la seguridad matemática de cierres estrictos.
|
||||||
|
|
||||||
|
### 3.1 El Bloque Condicional (if() / else() / end())
|
||||||
|
El comando `if()` gestiona la lógica condicional mediante dos modos de invocación estrictamente diferenciados. Es imperativo respetar los delimitadores y la posición de los argumentos.
|
||||||
|
|
||||||
|
#### Modo 1: Comparación Estructurada (Atómica)
|
||||||
|
Se utiliza para comparaciones directas entre dos valores simples.
|
||||||
|
* **Sintaxis:** `if(átomo_1, átomo_2, "operador")`
|
||||||
|
* **Argumentos 1 y 2:** Deben ser identificadores simples (variables) o literales (strings/números). **No se permite el uso de `None` en este modo.**
|
||||||
|
* **Argumento 3:** El operador de comparación debe ir obligatoriamente entre **comillas dobles** (`"=="`, `"!="`, `">"`, `"<"`, `">="`, `"<="`).
|
||||||
|
* **Restricción:** No se permiten expresiones de acceso (ej. `data.user` o `list[0]`). Estos valores deben asignarse previamente a una variable.
|
||||||
|
* **Ejemplo correcto:** `if(reintentos, 5, "<")`
|
||||||
|
|
||||||
|
#### Modo 2: Expresión Libre (Evaluación Compleja)
|
||||||
|
Se utiliza para evaluar expresiones lógicas que no encajan en la estructura atómica.
|
||||||
|
* **Sintaxis:** ``if(None, None, `expresión_compleja`)``
|
||||||
|
* **Argumentos 1 y 2:** Deben ser literalmente la palabra `None` (sin comillas).
|
||||||
|
* **Argumento 3:** La expresión completa **debe** estar encapsulada entre **acentos graves (backticks)**. Esto permite incluir lógica interna, operadores `and/or` y accesos a estructuras de datos.
|
||||||
|
* **Ejemplo correcto:** ``if(None, None, `user.id > 10 and email.contains("@")`)``
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Tabla de Validación para el Modelo
|
||||||
|
|
||||||
|
| Entrada | Estado | Razón |
|
||||||
|
| :--- | :--- | :--- |
|
||||||
|
| `if(count, 10, "==")` | ✅ VÁLIDO | Modo 1: Átomos válidos y operador entre comillas. |
|
||||||
|
| ``if(None, None, `val > 0`)`` | ✅ VÁLIDO | Modo 2: Uso correcto de `None` y backticks. |
|
||||||
|
| `if(username, None, "==")` | ❌ ERROR | El Modo 1 prohíbe el uso de `None`. Debe usarse el Modo 2. |
|
||||||
|
| `if(None, None, "val > 0")` | ❌ ERROR | El Modo 2 requiere backticks (`` ` ``), no comillas. |
|
||||||
|
| `if(user.id, 10, "==")` | ❌ ERROR | El Modo 1 no permite expresiones de acceso (`.`). |
|
||||||
|
|
||||||
### 3.2 Iteraciones Estrictas y Deterministas (startLoop / endLoop)
|
### 3.2 Iteraciones Estrictas y Deterministas (startLoop / endLoop)
|
||||||
Para garantizar el determinismo y evitar el colapso de memoria:
|
Para garantizar el determinismo y evitar el colapso de memoria:
|
||||||
|
|
@ -137,15 +171,17 @@ Diseñada para proteger la estabilidad del servidor ante fallos de I/O.
|
||||||
[ "else()" <EOL> <block> ]
|
[ "else()" <EOL> <block> ]
|
||||||
"end()" <EOL>
|
"end()" <EOL>
|
||||||
|
|
||||||
/* if() soporta dos modos:
|
<if_condition> ::= <if_structured> | <if_free_expression>
|
||||||
Modo 1 — comparación estructurada: los dos primeros argumentos deben ser
|
|
||||||
identificadores simples o literales, nunca expresiones de acceso.
|
<if_structured> ::= "if" "(" <strict_atom> "," <strict_atom> "," <stringliteral> ")"
|
||||||
Si se necesita comparar un valor extraído de una estructura (ej. dict['clave']),
|
<if_free_expression> ::= "if" "(" "None" "," "None" "," <backtick_string> ")"
|
||||||
debe asignarse previamente a una variable.
|
|
||||||
Modo 2 — expresión libre: None, None, expresión compleja como string */
|
<strict_atom> ::= <identifier> | <non_null_literal>
|
||||||
<if_condition> ::= <if_atom> "," <if_atom> "," <stringliteral>
|
<backtick_string> ::= "`" <text_content> "`"
|
||||||
| "None" "," "None" "," <stringliteral>
|
|
||||||
<if_atom> ::= <identifier> | <literal>
|
<identifier> ::= [a-zA-Z_][a-zA-Z0-9_]*
|
||||||
|
<non_null_literal>::= <number> | <string_literal_double_quotes>
|
||||||
|
/* Nota: <non_null_literal> NO incluye la palabra "None" */
|
||||||
|
|
||||||
<loop_stmt> ::= "startLoop(" <identifier> "," <expression> "," <expression> ")" <EOL>
|
<loop_stmt> ::= "startLoop(" <identifier> "," <expression> "," <expression> ")" <EOL>
|
||||||
<block>
|
<block>
|
||||||
|
|
@ -261,59 +297,116 @@ AVAP utiliza `avapConnector("TOKEN")` para la hidratación segura de credenciale
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos
|
# SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos
|
||||||
|
|
||||||
AVAP incluye un set de comandos integrados de alto nivel para manipular tipos complejos (JSON y Listas), tiempos, textos y generar hashes.
|
AVAP incluye un set de comandos integrados de alto nivel para manipular tipos complejos (JSON y Listas), tiempos, textos y generar hashes.
|
||||||
|
|
||||||
### 6.1 Manipulación Nativa de Listas y Objetos JSON
|
---
|
||||||
Para extraer y mutar estructuras complejas, AVAP provee comandos nativos específicos:
|
|
||||||
* **`variableToList(elemento, destino)`**: Fuerza a que una variable escalar se convierta en una estructura iterable de lista.
|
|
||||||
* **`itemFromList(lista_origen, indice, destino)`**: Extrae de forma segura el elemento contenido en la posición `indice` de una lista.
|
|
||||||
* **`variableFromJSON(json_origen, clave, destino)`**: Parsea un objeto JSON en memoria y extrae el valor correspondiente a la `clave`.
|
|
||||||
* **`AddVariableToJSON(clave, valor, json_destino)`**: Inyecta dinámicamente una nueva propiedad dentro de un objeto JSON existente.
|
|
||||||
|
|
||||||
### 6.2 Criptografía y Expresiones Regulares
|
## 6.1 Manipulación Nativa de Listas y Objetos JSON
|
||||||
* **`encodeSHA256` y `encodeMD5(origen, destino)`**: Funciones criptográficas que encriptan de forma irreversible un texto. Vitales para el almacenamiento seguro de contraseñas.
|
|
||||||
* **`getRegex(origen, patron, destino)`**: Aplica una Expresión Regular (`patron`) sobre la variable de origen, extrayendo las coincidencias exactas.
|
|
||||||
|
|
||||||
### 6.3 Transformación de Tiempo y Cadenas
|
Para extraer y mutar estructuras complejas, AVAP provee comandos nativos específicos. En AVAP, las listas **no se instancian con literales de array**, sino que se construyen y recorren a través de un conjunto cerrado de comandos especializados:
|
||||||
* **Fechas:** `getTimeStamp` (convierte un string a Epoch), `getDateTime` (Epoch a string legible), y `stampToDatetime` (Epoch a objeto datetime estructurado). Soportan formatos de calendario y cálculos con TimeDeltas.
|
|
||||||
* **Cadenas:** `replace` (saneamiento y sustitución de texto) y `randomString` (generación determinista de claves/tokens aleatorios).
|
|
||||||
|
|
||||||
### Especificación BNF (Sección VI)
|
* **`variableToList(elemento, destino)`**: Fuerza a que una variable escalar se convierta en una estructura iterable de lista de un único elemento. Es el punto de entrada canónico para construir una lista desde cero a partir de un valor existente.
|
||||||
|
|
||||||
|
* **`itemFromList(lista_origen, indice, destino)`**: Extrae de forma segura el elemento contenido en la posición `indice` (base 0) de una lista. Equivale a un acceso por índice controlado.
|
||||||
|
|
||||||
|
* **`getListLen(lista, destino)`**: Calcula el número total de elementos contenidos en `lista` y almacena el resultado entero en `destino`. Imprescindible para construir bucles de recorrido seguro y para validar listas antes de acceder a sus índices. Se recomienda llamar siempre a `getListLen` antes de `itemFromList` para evitar accesos fuera de rango.
|
||||||
|
|
||||||
|
* **`variableFromJSON(json_origen, clave, destino)`**: Parsea un objeto JSON en memoria y extrae el valor correspondiente a la `clave`, almacenándolo en `destino`. El acceso es directo por nombre de propiedad.
|
||||||
|
|
||||||
|
* **`AddVariableToJSON(clave, valor, json_destino)`**: Inyecta dinámicamente una nueva propiedad dentro de un objeto JSON existente. Si la clave ya existe, su valor es sobreescrito.
|
||||||
|
|
||||||
|
**Patrón de recorrido típico en AVAP:**
|
||||||
|
|
||||||
|
```avap
|
||||||
|
// 1. Obtener longitud de la lista
|
||||||
|
getListLen(myList, len)
|
||||||
|
|
||||||
|
// 2. Iterar con índice controlado
|
||||||
|
i = 0
|
||||||
|
while (i < len) {
|
||||||
|
itemFromList(myList, i, currentItem)
|
||||||
|
// ... procesar currentItem ...
|
||||||
|
i = i + 1
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6.2 Criptografía y Expresiones Regulares
|
||||||
|
|
||||||
|
* **`encodeSHA256(origen, destino)`** y **`encodeMD5(origen, destino)`**: Funciones criptográficas de hash que calculan un resumen (digest) irreversible de un texto. Vitales para el almacenamiento seguro de contraseñas y la verificación de integridad de datos. SHA-256 produce un digest de 64 caracteres hexadecimales y ofrece mayor resistencia criptográfica que MD5 (32 caracteres); se recomienda SHA-256 para nuevos desarrollos.
|
||||||
|
|
||||||
|
* **`getRegex(origen, patron, destino)`**: Aplica una Expresión Regular (`patron`) sobre la variable de origen, extrayendo la primera coincidencia exacta encontrada. El patrón sigue la sintaxis estándar compatible con Python `re`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6.3 Transformación de Tiempo y Cadenas
|
||||||
|
|
||||||
|
### Fechas y Timestamps
|
||||||
|
|
||||||
|
AVAP provee tres comandos complementarios para cubrir todas las conversiones posibles entre representaciones de tiempo. Los tres soportan formatos de calendario en notación `strftime` de Python y cálculos con `TimeDelta` expresados en segundos (positivo para sumar, negativo para restar):
|
||||||
|
|
||||||
|
| Comando | Entrada | Salida |
|
||||||
|
|---|---|---|
|
||||||
|
| `getTimeStamp(fecha_string, formato, timedelta, destino)` | String de fecha | Epoch (entero) |
|
||||||
|
| `stampToDatetime(epoch, formato, timedelta, destino)` | Epoch (entero) | String de fecha |
|
||||||
|
| `getDateTime(formato, timedelta, zona_horaria, destino)` | — (ahora mismo) | String de fecha |
|
||||||
|
|
||||||
|
* **`getTimeStamp(fecha_string, formato, timedelta, destino)`**: Convierte un string de fecha legible a su valor Epoch (entero Unix). Útil para almacenar fechas y realizar cálculos aritméticos sobre ellas.
|
||||||
|
|
||||||
|
* **`stampToDatetime(epoch, formato, timedelta, destino)`**: Convierte un valor Epoch a un string de fecha con el formato especificado. Útil para presentar timestamps almacenados de forma legible.
|
||||||
|
|
||||||
|
* **`getDateTime(formato, timedelta, zona_horaria, destino)`**: Captura la fecha y hora actuales del sistema, aplica el ajuste `timedelta` y las convierte a la `zona_horaria` indicada antes de almacenar el resultado. Acepta cualquier zona horaria reconocida por la librería `pytz` de Python.
|
||||||
|
|
||||||
|
### Cadenas de Texto
|
||||||
|
|
||||||
|
* **`randomString(patron, longitud, destino)`**: Genera una cadena aleatoria de `longitud` caracteres cuyos símbolos están restringidos al conjunto definido por `patron` (expresión regular de caracteres). Útil para generar tokens de sesión, contraseñas temporales o identificadores únicos.
|
||||||
|
|
||||||
|
* **`replace(origen, patron_busqueda, reemplazo, destino)`**: Localiza todas las ocurrencias de `patron_busqueda` dentro de `origen` y las sustituye por `reemplazo`, almacenando el resultado en `destino`. Facilita el saneamiento y normalización de datos de entrada antes de su procesamiento o almacenamiento.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## BNF — Gramática Formal de los Comandos de Utilidad
|
||||||
|
|
||||||
```bnf
|
```bnf
|
||||||
/* [CORRECCIÓN] Todas las subreglas de <util_command> están ahora completamente expandidas. */
|
<util_command> ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd>
|
||||||
<util_command> ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd> | <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>
|
| <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>
|
||||||
|
|
||||||
/* Manipulación de listas y JSON */
|
/* Manipulación de listas y JSON */
|
||||||
<json_list_cmd> ::= "variableToList(" <expression> "," <identifier> ")"
|
<json_list_cmd> ::= "variableToList(" <expression> "," <identifier> ")"
|
||||||
| "itemFromList(" <identifier> "," <expression> "," <identifier> ")"
|
| "itemFromList(" <identifier> "," <expression> "," <identifier> ")"
|
||||||
| "variableFromJSON(" <identifier> "," <expression> "," <identifier> ")"
|
| "getListLen(" <identifier> "," <identifier> ")"
|
||||||
| "AddVariableToJSON(" <expression> "," <expression> "," <identifier> ")"
|
| "variableFromJSON(" <identifier> "," <expression> "," <identifier> ")"
|
||||||
|
| "AddVariableToJSON(" <expression> "," <expression> "," <identifier> ")"
|
||||||
|
|
||||||
/* Criptografía */
|
/* Criptografía */
|
||||||
<crypto_cmd> ::= "encodeSHA256(" <identifier_or_string> "," <identifier> ")"
|
<crypto_cmd> ::= "encodeSHA256(" <expression> "," <identifier> ")"
|
||||||
| "encodeMD5(" <identifier_or_string> "," <identifier> ")"
|
| "encodeMD5(" <expression> "," <identifier> ")"
|
||||||
|
|
||||||
/* Expresiones regulares */
|
/* Expresiones regulares */
|
||||||
<regex_cmd> ::= "getRegex(" <identifier> "," <stringliteral> "," <identifier> ")"
|
<regex_cmd> ::= "getRegex(" <identifier> "," <expression> "," <identifier> ")"
|
||||||
|
|
||||||
<datetime_cmd> ::= "getDateTime(" <stringliteral> "," <expression> "," <stringliteral> "," <identifier> ")"
|
/* Fecha/hora actual → string */
|
||||||
/* Argumentos: formato_salida, epoch_origen, zona_horaria, destino */
|
<datetime_cmd> ::= "getDateTime(" <stringliteral> "," <expression> "," <stringliteral> "," <identifier> ")"
|
||||||
|
/* Argumentos: formato_salida, timedelta, zona_horaria, destino */
|
||||||
|
|
||||||
<stamp_cmd> ::= "stampToDatetime(" <expression> "," <stringliteral> "," <expression> "," <identifier> ")"
|
/* Conversiones epoch ↔ string */
|
||||||
|
<stamp_cmd> ::= "stampToDatetime(" <expression> "," <stringliteral> "," <expression> "," <identifier> ")"
|
||||||
/* Argumentos: epoch_origen, formato, timedelta, destino */
|
/* Argumentos: epoch_origen, formato, timedelta, destino */
|
||||||
| "getTimeStamp(" <stringliteral> "," <stringliteral> "," <expression> "," <identifier> ")"
|
| "getTimeStamp(" <stringliteral> "," <stringliteral> "," <expression> "," <identifier> ")"
|
||||||
/* Argumentos: fecha_string, formato_entrada, timedelta, destino */
|
/* Argumentos: fecha_string, formato_entrada, timedelta, destino */
|
||||||
|
|
||||||
<string_cmd> ::= "randomString(" <expression> "," <identifier> ")"
|
/* Cadenas */
|
||||||
/* Argumentos: longitud, destino */
|
<string_cmd> ::= "randomString(" <expression> "," <expression> "," <identifier> ")"
|
||||||
|
/* Argumentos: patron, longitud, destino */
|
||||||
|
|
||||||
<replace_cmd> ::= "replace(" <identifier_or_string> "," <stringliteral> "," <stringliteral> "," <identifier> ")"
|
<replace_cmd> ::= "replace(" <identifier> "," <stringliteral> "," <stringliteral> "," <identifier> ")"
|
||||||
/* Argumentos: origen, patron_busqueda, reemplazo, destino */
|
/* Argumentos: origen, patron_busqueda, reemplazo, destino */
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## SECCIÓN VII: Arquitectura de Funciones y Ámbitos (Scopes)
|
## SECCIÓN VII: Arquitectura de Funciones y Ámbitos (Scopes)
|
||||||
|
|
|
||||||
3191
docs/avap.txt
3191
docs/avap.txt
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
|
@ -5,7 +5,6 @@ description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"chonkie[semantic]>=1.5.6",
|
|
||||||
"grpcio>=1.78.0",
|
"grpcio>=1.78.0",
|
||||||
"grpcio-reflection>=1.78.0",
|
"grpcio-reflection>=1.78.0",
|
||||||
"grpcio-tools>=1.78.0",
|
"grpcio-tools>=1.78.0",
|
||||||
|
|
@ -28,7 +27,9 @@ dependencies = [
|
||||||
dev = [
|
dev = [
|
||||||
"beir>=2.2.0",
|
"beir>=2.2.0",
|
||||||
"boto3>=1.42.58",
|
"boto3>=1.42.58",
|
||||||
|
"chonkie[elastic,semantic]>=1.6.0",
|
||||||
"evidently>=0.7.20",
|
"evidently>=0.7.20",
|
||||||
|
"flatbuffers>=25.12.19",
|
||||||
"jupyter>=1.1.1",
|
"jupyter>=1.1.1",
|
||||||
"langfuse<3",
|
"langfuse<3",
|
||||||
"litellm>=1.82.0",
|
"litellm>=1.82.0",
|
||||||
|
|
|
||||||
|
|
@ -1,30 +1,29 @@
|
||||||
import re
|
|
||||||
import hashlib
|
|
||||||
from typing import Any
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
from langchain_core.documents import Document
|
|
||||||
from langchain_elasticsearch import ElasticsearchStore
|
from langchain_elasticsearch import ElasticsearchStore
|
||||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
from chonkie import SemanticChunker, MarkdownChef
|
||||||
from langchain_experimental.text_splitter import SemanticChunker
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from src.utils.emb_factory import create_embedding_model
|
from src.utils.emb_factory import create_embedding_model
|
||||||
from scripts.pipelines.tasks.chunk import scrape_avap_docs
|
from scripts.pipelines.tasks.chunk import (
|
||||||
|
read_files,
|
||||||
|
get_chunk_docs,
|
||||||
|
convert_chunks_to_document
|
||||||
|
)
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||||
|
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||||
|
|
||||||
class DistanceStrategy(str, Enum):
|
class DistanceStrategy(str, Enum):
|
||||||
euclidean = "EUCLIDEAN_DISTANCE"
|
euclidean = "EUCLIDEAN_DISTANCE"
|
||||||
|
|
@ -33,55 +32,45 @@ class DistanceStrategy(str, Enum):
|
||||||
jaccard = "JACCARD"
|
jaccard = "JACCARD"
|
||||||
cosine = "COSINE"
|
cosine = "COSINE"
|
||||||
|
|
||||||
def clean_text(text: str) -> str:
|
|
||||||
text = text.replace("\u00a0", " ")
|
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
|
||||||
return text
|
|
||||||
|
|
||||||
def build_documents_from_folder(
|
|
||||||
folder_path: str,
|
|
||||||
) -> list[Document]:
|
|
||||||
|
|
||||||
folder = Path(folder_path)
|
|
||||||
|
|
||||||
if not folder.exists() or not folder.is_dir():
|
|
||||||
raise ValueError(f"Invalid folder path: {folder_path}")
|
|
||||||
|
|
||||||
all_documents: list[Document] = []
|
|
||||||
|
|
||||||
for file_path in folder.glob("*.txt"):
|
|
||||||
doc_text = file_path.read_text(encoding="utf-8")
|
|
||||||
|
|
||||||
if not doc_text.strip():
|
|
||||||
continue
|
|
||||||
|
|
||||||
metadata: dict[str, Any] = {
|
|
||||||
"source": file_path.name,
|
|
||||||
}
|
|
||||||
|
|
||||||
doc_text = clean_text(doc_text)
|
|
||||||
document = Document(
|
|
||||||
id=hashlib.md5(file_path.name.encode()).hexdigest(),
|
|
||||||
page_content=doc_text,
|
|
||||||
metadata={**metadata}
|
|
||||||
)
|
|
||||||
|
|
||||||
all_documents.append(document)
|
|
||||||
|
|
||||||
return all_documents
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def elasticsearch_ingestion(
|
def elasticsearch_ingestion(
|
||||||
docs_folder_path: str = "ingestion/docs",
|
docs_folder_path: str = "docs",
|
||||||
|
es_index: str = "avap-docs-test-v2",
|
||||||
es_request_timeout: int = 120,
|
es_request_timeout: int = 120,
|
||||||
es_max_retries: int = 5,
|
es_max_retries: int = 5,
|
||||||
es_retry_on_timeout: bool = True,
|
es_retry_on_timeout: bool = True,
|
||||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||||
|
chunk_size: int = 2048,
|
||||||
|
chunk_threshold: float = 0.5,
|
||||||
|
chunk_similarity_window: int = 3,
|
||||||
|
chunk_skip_window: int = 1,
|
||||||
):
|
):
|
||||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
logger.info(f"Using docs folder path: {docs_folder_path}")
|
logger.info(f"Reading files from folder: {docs_folder_path}/LRM and {docs_folder_path}/samples...")
|
||||||
documents = build_documents_from_folder(folder_path=docs_folder_path)
|
avap_code_docs = read_files(f"{docs_folder_path}/samples")
|
||||||
|
avap_language_docs = read_files(f"{docs_folder_path}/LRM")
|
||||||
|
|
||||||
|
logger.info("Instantiating semantic chunker and chef...")
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(HF_EMB_MODEL_NAME)
|
||||||
|
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||||
|
chunker = SemanticChunker(
|
||||||
|
embedding_model=HF_EMB_MODEL_NAME,
|
||||||
|
chunk_size=chunk_size,
|
||||||
|
threshold=chunk_threshold,
|
||||||
|
similarity_window=chunk_similarity_window,
|
||||||
|
skip_window=chunk_skip_window
|
||||||
|
)
|
||||||
|
logger.info("Processing Markdown docs with chef...")
|
||||||
|
doc = chef.process(f"{docs_folder_path}/LRM/avap.md")
|
||||||
|
|
||||||
|
logger.info("Chunking AVAP Language docs...")
|
||||||
|
avap_language_docs_chunks = get_chunk_docs(avap_language_docs, chunker)
|
||||||
|
|
||||||
|
logger.info("Creating Langchain Document to index...")
|
||||||
|
avap_language_langchain_docs = convert_chunks_to_document(avap_language_docs_chunks)
|
||||||
|
avap_code_langchain_docs = convert_chunks_to_document(avap_code_docs)
|
||||||
|
avap_documents = avap_language_langchain_docs + avap_code_langchain_docs
|
||||||
|
|
||||||
logger.info("Connecting to Elasticsearch...")
|
logger.info("Connecting to Elasticsearch...")
|
||||||
try:
|
try:
|
||||||
|
|
@ -106,15 +95,19 @@ def elasticsearch_ingestion(
|
||||||
logger.exception("Failed to instantiate embeddings model.")
|
logger.exception("Failed to instantiate embeddings model.")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
logger.info(f"Checking if index {es_index} exists and deleting if it does...")
|
||||||
|
if es.indices.exists(index=es_index):
|
||||||
|
es.indices.delete(index=es_index)
|
||||||
|
|
||||||
|
logger.info(f"Uploading documents to index {es_index}...")
|
||||||
ElasticsearchStore.from_documents(
|
ElasticsearchStore.from_documents(
|
||||||
documents,
|
avap_documents,
|
||||||
embeddings,
|
embeddings,
|
||||||
client=es,
|
client=es,
|
||||||
index_name=ELASTICSEARCH_INDEX,
|
index_name=es_index,
|
||||||
distance_strategy=distance_strategy.value,
|
distance_strategy=distance_strategy.value,
|
||||||
)
|
)
|
||||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
logger.info(f"Finished uploading documents to index {es_index}.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,122 @@
|
||||||
|
import typer
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
from chonkie import MarkdownChef, FileFetcher, ElasticHandshake
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
from src.config import settings
|
||||||
|
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||||
|
from scripts.pipelines.tasks.chunk import merge_markdown_document
|
||||||
|
|
||||||
|
app = typer.Typer()
|
||||||
|
|
||||||
|
def get_processing_and_chunking_config(docs_extension: str, chunk_size: int,
|
||||||
|
chunk_threshold: float | None,
|
||||||
|
chunk_similarity_window: int| None,
|
||||||
|
chunk_skip_window: int | None) -> tuple[str, dict, str, dict]:
|
||||||
|
"""
|
||||||
|
Check the file extension and return the appropriate processing and chunking strategies and their kwargs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
docs_extension (str): The file extension of the documents to be ingested.
|
||||||
|
chunk_size (int): The size of the chunks to be created.
|
||||||
|
chunk_threshold (float, optional): The threshold for semantic chunking. Required if docs_extension is .md.
|
||||||
|
chunk_similarity_window (int, optional): The similarity window for semantic chunking
|
||||||
|
chunk_skip_window (int, optional): The skip window for semantic chunking.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple[str, dict, str, dict]: A tuple containing the processing strategy, its kwargs, the chunking strategy, and its kwargs.
|
||||||
|
"""
|
||||||
|
if docs_extension == ".md":
|
||||||
|
process_type = "markdown"
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||||
|
process_kwargs = {"tokenizer": custom_tokenizer}
|
||||||
|
# process_type = "text"
|
||||||
|
# process_kwargs = {}
|
||||||
|
chunk_strat = "semantic"
|
||||||
|
chunk_kwargs = {"embedding_model": settings.hf_emb_model_name, "threshold": chunk_threshold, "chunk_size": chunk_size,
|
||||||
|
"similarity_window": chunk_similarity_window, "skip_window": chunk_skip_window}
|
||||||
|
|
||||||
|
elif docs_extension == ".avap":
|
||||||
|
process_type = "text"
|
||||||
|
process_kwargs = {}
|
||||||
|
chunk_strat = "recursive" # Once we have the BNF and uploaded to tree-sitter, we can use code (?)
|
||||||
|
chunk_kwargs = {"chunk_size": chunk_size}
|
||||||
|
|
||||||
|
return process_type, process_kwargs, chunk_strat, chunk_kwargs
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/LRM",
    docs_extension: str = ".md",
    es_index: str = "avap-docs-test-v3",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = True,
    chunk_size: int = 2048,
    chunk_threshold: float | None = 0.5,
    chunk_similarity_window: int | None = 3,
    chunk_skip_window: int | None = 1
):
    """
    Fetch documents from a folder, process them and write their chunks to an Elasticsearch index.

    Args:
        docs_folder_path (str): Folder, relative to settings.proj_root, to read documents from.
        docs_extension (str): Extension of the files to ingest; other files in the folder are ignored.
        es_index (str): Name of the target Elasticsearch index.
        es_request_timeout (int): Passed to the Elasticsearch client as request_timeout.
        es_max_retries (int): Passed to the Elasticsearch client as max_retries.
        es_retry_on_timeout (bool): Passed to the Elasticsearch client as retry_on_timeout.
        delete_es_index (bool): Delete es_index before writing if it already exists.
        chunk_size (int): Forwarded to get_processing_and_chunking_config.
        chunk_threshold (float, optional): Forwarded to get_processing_and_chunking_config.
        chunk_similarity_window (int, optional): Forwarded to get_processing_and_chunking_config.
        chunk_skip_window (int, optional): Forwarded to get_processing_and_chunking_config.
    """
    logger.info("Starting Elasticsearch ingestion pipeline...")

    # Resolve the strategy first so an unsupported extension fails before
    # anything destructive (index deletion) happens.
    (process_type,
     process_kwargs,
     chunk_strat,
     _chunk_kwargs) = get_processing_and_chunking_config(docs_extension, chunk_size, chunk_threshold, chunk_similarity_window, chunk_skip_window)
    # NOTE(review): the chunking kwargs are not consumed below; chunk_strat is
    # only logged and merge_markdown_document does the actual merging.

    logger.info(f"Fetching files from {docs_folder_path}...")
    fetcher = FileFetcher()
    # Restrict the fetch to the requested extension so unrelated files in the
    # folder are not pushed through the Markdown pipeline.
    docs = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}", ext=[docs_extension])
    if not docs:
        # Leave the existing index untouched when there is nothing to ingest.
        logger.warning(f"No '{docs_extension}' files found in {docs_folder_path}; nothing to ingest.")
        return

    # Reuse the tokenizer already loaded for the processing config when there
    # is one, instead of loading the same model a second time.
    custom_tokenizer = process_kwargs.get("tokenizer")
    if custom_tokenizer is None:
        custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)

    logger.info(f"Processing documents with process_type: {process_type}...")
    chef = MarkdownChef(tokenizer=custom_tokenizer)
    processed_docs = [chef.process(doc) for doc in docs]

    logger.info(f"Chunking documents with chunk_strat: {chunk_strat}...")
    fused_docs = [merge_markdown_document(processed_doc) for processed_doc in processed_docs]

    logger.info(f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}...")
    es = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )
    try:
        # Deletion happens only once the documents were fetched and processed
        # successfully, so a failed run does not wipe the previous index.
        if delete_es_index and es.indices.exists(index=es_index):
            logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
            es.indices.delete(index=es_index)

        logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
        handshake = ElasticHandshake(
            client=es,
            index_name=es_index,
            embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name)
        )
        for fused_doc in fused_docs:
            handshake.write(fused_doc.chunks)
    finally:
        # Release the client's connections whether or not ingestion succeeded.
        es.close()

    logger.info(f"Finished ingesting in {es_index}.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Configure stdlib logging, then run the Typer app; any failure is logged
    # with its traceback through the module logger and re-raised.
    log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    try:
        app()
    except Exception as err:
        logger.exception(err)
        raise
|
||||||
|
|
@ -32,12 +32,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"from src.utils.llm_factory import create_chat_model\n",
|
"from src.utils.llm_factory import create_chat_model\n",
|
||||||
"from src.utils.emb_factory import create_embedding_model\n",
|
"from src.utils.emb_factory import create_embedding_model\n",
|
||||||
"from src.config import (\n",
|
"from src.config import settings"
|
||||||
" ELASTICSEARCH_LOCAL_URL,\n",
|
|
||||||
" ELASTICSEARCH_INDEX,\n",
|
|
||||||
" OLLAMA_MODEL_NAME,\n",
|
|
||||||
" OLLAMA_EMB_MODEL_NAME\n",
|
|
||||||
")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -51,20 +46,20 @@
|
||||||
"\n",
|
"\n",
|
||||||
"llm = create_chat_model(\n",
|
"llm = create_chat_model(\n",
|
||||||
" provider=\"ollama\",\n",
|
" provider=\"ollama\",\n",
|
||||||
" model=OLLAMA_MODEL_NAME,\n",
|
" model=settings.ollama_model_name,\n",
|
||||||
" temperature=0.5,\n",
|
" temperature=0.5,\n",
|
||||||
" validate_model_on_init=True,\n",
|
" validate_model_on_init=True,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"embeddings = create_embedding_model(\n",
|
"embeddings = create_embedding_model(\n",
|
||||||
" provider=\"ollama\",\n",
|
" provider=\"ollama\",\n",
|
||||||
" model=OLLAMA_EMB_MODEL_NAME,\n",
|
" model=settings.ollama_emb_model_name,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"vector_store = ElasticsearchStore(\n",
|
"vector_store = ElasticsearchStore(\n",
|
||||||
" es_url=ELASTICSEARCH_LOCAL_URL,\n",
|
" es_url=settings.elasticsearch_local_url,\n",
|
||||||
" index_name=ELASTICSEARCH_INDEX,\n",
|
" index_name=\"avap-docs-test-v3\",\n",
|
||||||
" embedding=embeddings,\n",
|
" embedding=embeddings,\n",
|
||||||
" query_field=\"text\",\n",
|
" query_field=\"text\",\n",
|
||||||
" vector_query_field=\"vector\",\n",
|
" vector_query_field=\"embedding\",\n",
|
||||||
" # strategy=ElasticsearchStore.ApproxRetrievalStrategy(\n",
|
" # strategy=ElasticsearchStore.ApproxRetrievalStrategy(\n",
|
||||||
" # hybrid=True,\n",
|
" # hybrid=True,\n",
|
||||||
" # rrf={\"rank_constant\": 60, \"window_size\": 100}\n",
|
" # rrf={\"rank_constant\": 60, \"window_size\": 100}\n",
|
||||||
|
|
@ -464,44 +459,185 @@
|
||||||
"text": [
|
"text": [
|
||||||
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
||||||
"\n",
|
"\n",
|
||||||
"What types of includes does AVAP have?\n"
|
"What types of includes does AVAP have?\n",
|
||||||
]
|
"[reformulate] 'What types of includes does AVAP have?' → '\"avap includes type\"'\n",
|
||||||
},
|
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
||||||
{
|
"\n",
|
||||||
"ename": "ResponseError",
|
"What types of includes does AVAP have?\n",
|
||||||
"evalue": "failed to parse JSON: unexpected end of JSON input (status code: -1)",
|
"[retrieve] 3 docs fetched\n",
|
||||||
"output_type": "error",
|
"[1] id=chunk-1 source=Untitled\n",
|
||||||
"traceback": [
|
"\n",
|
||||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
"\n",
|
||||||
"\u001b[31mResponseError\u001b[39m Traceback (most recent call last)",
|
"Token:\n",
|
||||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m a = \u001b[43mstream_graph_updates\u001b[49m\u001b[43m(\u001b[49m\u001b[43muser_input\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magentic_graph\u001b[49m\u001b[43m)\u001b[49m\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langfuse/decorators/langfuse_decorator.py:256\u001b[39m, in \u001b[36mLangfuseDecorator._sync_observe.<locals>.sync_wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 254\u001b[39m result = func(*args, **kwargs)\n\u001b[32m 255\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m256\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_handle_exception\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobservation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 257\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 258\u001b[39m result = \u001b[38;5;28mself\u001b[39m._finalize_call(\n\u001b[32m 259\u001b[39m observation, result, capture_output, transform_to_string\n\u001b[32m 260\u001b[39m )\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langfuse/decorators/langfuse_decorator.py:520\u001b[39m, in \u001b[36mLangfuseDecorator._handle_exception\u001b[39m\u001b[34m(self, observation, e)\u001b[39m\n\u001b[32m 516\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m observation:\n\u001b[32m 517\u001b[39m _observation_params_context.get()[observation.id].update(\n\u001b[32m 518\u001b[39m level=\u001b[33m\"\u001b[39m\u001b[33mERROR\u001b[39m\u001b[33m\"\u001b[39m, status_message=\u001b[38;5;28mstr\u001b[39m(e)\n\u001b[32m 519\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m520\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langfuse/decorators/langfuse_decorator.py:254\u001b[39m, in \u001b[36mLangfuseDecorator._sync_observe.<locals>.sync_wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 251\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 253\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m254\u001b[39m result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 255\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 256\u001b[39m \u001b[38;5;28mself\u001b[39m._handle_exception(observation, e)\n",
|
"ASSIGN\n",
|
||||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 9\u001b[39m, in \u001b[36mstream_graph_updates\u001b[39m\u001b[34m(user_input, graph)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;129m@observe\u001b[39m(name=\u001b[33m\"\u001b[39m\u001b[33mgraph_run\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mstream_graph_updates\u001b[39m(user_input: \u001b[38;5;28mstr\u001b[39m, graph: StateGraph):\n\u001b[32m 3\u001b[39m langfuse_context.update_current_trace(\n\u001b[32m 4\u001b[39m user_id=\u001b[33m\"\u001b[39m\u001b[33malberto\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 5\u001b[39m tags=[\u001b[33m\"\u001b[39m\u001b[33mavap\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mrag\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mlanggraph\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 6\u001b[39m metadata={\u001b[33m\"\u001b[39m\u001b[33mfeature\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33magentic-rag\u001b[39m\u001b[33m\"\u001b[39m},\n\u001b[32m 7\u001b[39m )\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mgraph\u001b[49m\u001b[43m.\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43muser_input\u001b[49m\u001b[43m}\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mvalues\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43m-\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mpretty_print\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m event[\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m][-\u001b[32m1\u001b[39m]\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/pregel/main.py:2646\u001b[39m, in \u001b[36mPregel.stream\u001b[39m\u001b[34m(self, input, config, context, stream_mode, print_mode, output_keys, interrupt_before, interrupt_after, durability, subgraphs, debug, **kwargs)\u001b[39m\n\u001b[32m 2644\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m task \u001b[38;5;129;01min\u001b[39;00m loop.match_cached_writes():\n\u001b[32m 2645\u001b[39m loop.output_writes(task.id, task.writes, cached=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m-> \u001b[39m\u001b[32m2646\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m_\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrunner\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtick\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2647\u001b[39m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mloop\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtasks\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwrites\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2648\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mstep_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2649\u001b[39m \u001b[43m \u001b[49m\u001b[43mget_waiter\u001b[49m\u001b[43m=\u001b[49m\u001b[43mget_waiter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2650\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mschedule_task\u001b[49m\u001b[43m=\u001b[49m\u001b[43mloop\u001b[49m\u001b[43m.\u001b[49m\u001b[43maccept_push\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2651\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 2652\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# emit output\u001b[39;49;00m\n\u001b[32m 2653\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield from\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m_output\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2654\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_mode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprint_mode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msubgraphs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mqueue\u001b[49m\u001b[43m.\u001b[49m\u001b[43mEmpty\u001b[49m\n\u001b[32m 2655\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2656\u001b[39m loop.after_tick()\n",
|
"[2] id=chunk-2 source=Untitled\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/pregel/_runner.py:167\u001b[39m, in \u001b[36mPregelRunner.tick\u001b[39m\u001b[34m(self, tasks, reraise, timeout, retry_policy, get_waiter, schedule_task)\u001b[39m\n\u001b[32m 165\u001b[39m t = tasks[\u001b[32m0\u001b[39m]\n\u001b[32m 166\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m167\u001b[39m \u001b[43mrun_with_retry\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 168\u001b[39m \u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 169\u001b[39m \u001b[43m \u001b[49m\u001b[43mretry_policy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 170\u001b[39m \u001b[43m \u001b[49m\u001b[43mconfigurable\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 171\u001b[39m \u001b[43m \u001b[49m\u001b[43mCONFIG_KEY_CALL\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mpartial\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[43m \u001b[49m\u001b[43m_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 173\u001b[39m \u001b[43m \u001b[49m\u001b[43mweakref\u001b[49m\u001b[43m.\u001b[49m\u001b[43mref\u001b[49m\u001b[43m(\u001b[49m\u001b[43mt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 174\u001b[39m \u001b[43m \u001b[49m\u001b[43mretry_policy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretry_policy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 175\u001b[39m \u001b[43m \u001b[49m\u001b[43mfutures\u001b[49m\u001b[43m=\u001b[49m\u001b[43mweakref\u001b[49m\u001b[43m.\u001b[49m\u001b[43mref\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfutures\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 176\u001b[39m \u001b[43m \u001b[49m\u001b[43mschedule_task\u001b[49m\u001b[43m=\u001b[49m\u001b[43mschedule_task\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 177\u001b[39m \u001b[43m 
\u001b[49m\u001b[43msubmit\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msubmit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 178\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 179\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 180\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 181\u001b[39m \u001b[38;5;28mself\u001b[39m.commit(t, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m 182\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/pregel/_retry.py:42\u001b[39m, in \u001b[36mrun_with_retry\u001b[39m\u001b[34m(task, retry_policy, configurable)\u001b[39m\n\u001b[32m 40\u001b[39m task.writes.clear()\n\u001b[32m 41\u001b[39m \u001b[38;5;66;03m# run the task\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m42\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtask\u001b[49m\u001b[43m.\u001b[49m\u001b[43mproc\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtask\u001b[49m\u001b[43m.\u001b[49m\u001b[43minput\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 43\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ParentCommand \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 44\u001b[39m ns: \u001b[38;5;28mstr\u001b[39m = config[CONF][CONFIG_KEY_CHECKPOINT_NS]\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/_internal/_runnable.py:656\u001b[39m, in \u001b[36mRunnableSeq.invoke\u001b[39m\u001b[34m(self, input, config, **kwargs)\u001b[39m\n\u001b[32m 654\u001b[39m \u001b[38;5;66;03m# run in context\u001b[39;00m\n\u001b[32m 655\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m set_config_context(config, run) \u001b[38;5;28;01mas\u001b[39;00m context:\n\u001b[32m--> \u001b[39m\u001b[32m656\u001b[39m \u001b[38;5;28minput\u001b[39m = \u001b[43mcontext\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 657\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 658\u001b[39m \u001b[38;5;28minput\u001b[39m = step.invoke(\u001b[38;5;28minput\u001b[39m, config)\n",
|
"> **Nota de implementación:** `<connector_instantiation>` se distingue de `<orm_connector_init>` (ORM) únicamente por contexto semántico: el UUID pasado como argumento determina si el adaptador resuelto es un ORM de base de datos o un proxy de terceros. La gramática los trata de forma idéntica; el motor de ejecución selecciona el adaptador apropiado en runtime.\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/_internal/_runnable.py:400\u001b[39m, in \u001b[36mRunnableCallable.invoke\u001b[39m\u001b[34m(self, input, config, **kwargs)\u001b[39m\n\u001b[32m 398\u001b[39m run_manager.on_chain_end(ret)\n\u001b[32m 399\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m400\u001b[39m ret = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 401\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.recurse \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(ret, Runnable):\n\u001b[32m 402\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m ret.invoke(\u001b[38;5;28minput\u001b[39m, config)\n",
|
"\n",
|
||||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 5\u001b[39m, in \u001b[36magent\u001b[39m\u001b[34m(state)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34magent\u001b[39m(state: AgenticAgentState) -> AgenticAgentState:\n\u001b[32m 4\u001b[39m llm_with_tools = llm.bind_tools(tools)\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m: [\u001b[43mllm_with_tools\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mSystemMessage\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m=\u001b[49m\u001b[43mAGENTIC_PROMPT\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m]}\n",
|
"---\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/runnables/base.py:5695\u001b[39m, in \u001b[36mRunnableBindingBase.invoke\u001b[39m\u001b[34m(self, input, config, **kwargs)\u001b[39m\n\u001b[32m 5688\u001b[39m \u001b[38;5;129m@override\u001b[39m\n\u001b[32m 5689\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34minvoke\u001b[39m(\n\u001b[32m 5690\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 5693\u001b[39m **kwargs: Any | \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 5694\u001b[39m ) -> Output:\n\u001b[32m-> \u001b[39m\u001b[32m5695\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbound\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5696\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 5697\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_merge_configs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5698\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43m{\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5699\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:402\u001b[39m, in \u001b[36mBaseChatModel.invoke\u001b[39m\u001b[34m(self, input, config, stop, **kwargs)\u001b[39m\n\u001b[32m 388\u001b[39m \u001b[38;5;129m@override\u001b[39m\n\u001b[32m 389\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34minvoke\u001b[39m(\n\u001b[32m 390\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 395\u001b[39m **kwargs: Any,\n\u001b[32m 396\u001b[39m ) -> AIMessage:\n\u001b[32m 397\u001b[39m config = ensure_config(config)\n\u001b[32m 398\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m cast(\n\u001b[32m 399\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mAIMessage\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 400\u001b[39m cast(\n\u001b[32m 401\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mChatGeneration\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m--> \u001b[39m\u001b[32m402\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgenerate_prompt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 403\u001b[39m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 404\u001b[39m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 405\u001b[39m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcallbacks\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 406\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mtags\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtags\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 407\u001b[39m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmetadata\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 408\u001b[39m \u001b[43m \u001b[49m\u001b[43mrun_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrun_name\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 409\u001b[39m \u001b[43m \u001b[49m\u001b[43mrun_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrun_id\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 410\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 411\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m.generations[\u001b[32m0\u001b[39m][\u001b[32m0\u001b[39m],\n\u001b[32m 412\u001b[39m ).message,\n\u001b[32m 413\u001b[39m )\n",
|
"## SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:1123\u001b[39m, in \u001b[36mBaseChatModel.generate_prompt\u001b[39m\u001b[34m(self, prompts, stop, callbacks, **kwargs)\u001b[39m\n\u001b[32m 1114\u001b[39m \u001b[38;5;129m@override\u001b[39m\n\u001b[32m 1115\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mgenerate_prompt\u001b[39m(\n\u001b[32m 1116\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 1120\u001b[39m **kwargs: Any,\n\u001b[32m 1121\u001b[39m ) -> LLMResult:\n\u001b[32m 1122\u001b[39m prompt_messages = [p.to_messages() \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m prompts]\n\u001b[32m-> \u001b[39m\u001b[32m1123\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprompt_messages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:933\u001b[39m, in \u001b[36mBaseChatModel.generate\u001b[39m\u001b[34m(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)\u001b[39m\n\u001b[32m 930\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i, m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(input_messages):\n\u001b[32m 931\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 932\u001b[39m results.append(\n\u001b[32m--> \u001b[39m\u001b[32m933\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_generate_with_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 934\u001b[39m \u001b[43m \u001b[49m\u001b[43mm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 935\u001b[39m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 936\u001b[39m \u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrun_managers\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun_managers\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 937\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 938\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 939\u001b[39m )\n\u001b[32m 940\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 941\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m run_managers:\n",
|
"AVAP incluye un set de comandos integrados de alto nivel para manipular tipos complejos (JSON y Listas), tiempos, textos y generar hashes.\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:1235\u001b[39m, in \u001b[36mBaseChatModel._generate_with_cache\u001b[39m\u001b[34m(self, messages, stop, run_manager, **kwargs)\u001b[39m\n\u001b[32m 1233\u001b[39m result = generate_from_stream(\u001b[38;5;28miter\u001b[39m(chunks))\n\u001b[32m 1234\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m inspect.signature(\u001b[38;5;28mself\u001b[39m._generate).parameters.get(\u001b[33m\"\u001b[39m\u001b[33mrun_manager\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m-> \u001b[39m\u001b[32m1235\u001b[39m result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_generate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1236\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 1237\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1238\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1239\u001b[39m result = \u001b[38;5;28mself\u001b[39m._generate(messages, stop=stop, **kwargs)\n",
|
"\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:1030\u001b[39m, in \u001b[36mChatOllama._generate\u001b[39m\u001b[34m(self, messages, stop, run_manager, **kwargs)\u001b[39m\n\u001b[32m 1023\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_generate\u001b[39m(\n\u001b[32m 1024\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 1025\u001b[39m messages: \u001b[38;5;28mlist\u001b[39m[BaseMessage],\n\u001b[32m (...)\u001b[39m\u001b[32m 1028\u001b[39m **kwargs: Any,\n\u001b[32m 1029\u001b[39m ) -> ChatResult:\n\u001b[32m-> \u001b[39m\u001b[32m1030\u001b[39m final_chunk = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_chat_stream_with_aggregation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1031\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 1032\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1033\u001b[39m generation_info = final_chunk.generation_info\n\u001b[32m 1034\u001b[39m chat_generation = ChatGeneration(\n\u001b[32m 1035\u001b[39m message=AIMessage(\n\u001b[32m 1036\u001b[39m content=final_chunk.text,\n\u001b[32m (...)\u001b[39m\u001b[32m 1043\u001b[39m generation_info=generation_info,\n\u001b[32m 1044\u001b[39m )\n",
|
"### 6.1 Manipulación Nativa de Listas y Objetos JSON\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:965\u001b[39m, in \u001b[36mChatOllama._chat_stream_with_aggregation\u001b[39m\u001b[34m(self, messages, stop, run_manager, verbose, **kwargs)\u001b[39m\n\u001b[32m 956\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_chat_stream_with_aggregation\u001b[39m(\n\u001b[32m 957\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 958\u001b[39m messages: \u001b[38;5;28mlist\u001b[39m[BaseMessage],\n\u001b[32m (...)\u001b[39m\u001b[32m 962\u001b[39m **kwargs: Any,\n\u001b[32m 963\u001b[39m ) -> ChatGenerationChunk:\n\u001b[32m 964\u001b[39m final_chunk = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m965\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_iterate_over_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 966\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfinal_chunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m:\u001b[49m\n\u001b[32m 967\u001b[39m \u001b[43m \u001b[49m\u001b[43mfinal_chunk\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\n",
|
"Para extraer y mutar estructuras complejas, AVAP provee comandos nativos específicos:\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:1054\u001b[39m, in \u001b[36mChatOllama._iterate_over_stream\u001b[39m\u001b[34m(self, messages, stop, **kwargs)\u001b[39m\n\u001b[32m 1047\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_iterate_over_stream\u001b[39m(\n\u001b[32m 1048\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 1049\u001b[39m messages: \u001b[38;5;28mlist\u001b[39m[BaseMessage],\n\u001b[32m 1050\u001b[39m stop: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m] | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 1051\u001b[39m **kwargs: Any,\n\u001b[32m 1052\u001b[39m ) -> Iterator[ChatGenerationChunk]:\n\u001b[32m 1053\u001b[39m reasoning = kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mreasoning\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m.reasoning)\n\u001b[32m-> \u001b[39m\u001b[32m1054\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_create_chat_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 1055\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43misinstance\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 1056\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1057\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessage\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 1058\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessage\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessage\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 1059\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 1060\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
"* **`variableToList(elemento, destino)`**: Fuerza a que una variable escalar se convierta en una estructura iterable de lista.\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:952\u001b[39m, in \u001b[36mChatOllama._create_chat_stream\u001b[39m\u001b[34m(self, messages, stop, **kwargs)\u001b[39m\n\u001b[32m 950\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chat_params[\u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m 951\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client:\n\u001b[32m--> \u001b[39m\u001b[32m952\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client.chat(**chat_params)\n\u001b[32m 953\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client:\n\u001b[32m 954\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client.chat(**chat_params)\n",
|
"* **`itemFromList(lista_origen, indice, destino)`**: Extrae de forma segura el elemento contenido en la posición `indice` de una lista.\n",
|
||||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ollama/_client.py:184\u001b[39m, in \u001b[36mClient._request.<locals>.inner\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 182\u001b[39m part = json.loads(line)\n\u001b[32m 183\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m err := part.get(\u001b[33m'\u001b[39m\u001b[33merror\u001b[39m\u001b[33m'\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m184\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ResponseError(err)\n\u001b[32m 185\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(**part)\n",
|
"* **`variableFromJSON(json_origen, clave, destino)`**: Parsea un objeto JSON en memoria y extrae el valor correspondiente a la `clave`.\n",
|
||||||
"\u001b[31mResponseError\u001b[39m: failed to parse JSON: unexpected end of JSON input (status code: -1)",
|
"* **`AddVariableToJSON(clave, valor, json_destino)`**: Inyecta dinámicamente una nueva propiedad dentro de un objeto JSON existente.\n",
|
||||||
"During task with name 'agent' and id '9110cf29-5205-b67b-0456-234df433158a'"
|
"\n",
|
||||||
|
"### 6.2 Criptografía y Expresiones Regulares\n",
|
||||||
|
"* **`encodeSHA256` y `encodeMD5(origen, destino)`**: Funciones criptográficas que encriptan de forma irreversible un texto. Vitales para el almacenamiento seguro de contraseñas.\n",
|
||||||
|
"* **`getRegex(origen, patron, destino)`**: Aplica una Expresión Regular (`patron`) sobre la variable de origen, extrayendo las coincidencias exactas.\n",
|
||||||
|
"\n",
|
||||||
|
"### 6.3 Transformación de Tiempo y Cadenas\n",
|
||||||
|
"* **Fechas:** `getTimeStamp` (convierte un string a Epoch), `getDateTime` (Epoch a string legible), y `stampToDatetime` (Epoch a objeto datetime estructurado). Soportan formatos de calendario y cálculos con TimeDeltas.\n",
|
||||||
|
"* **Cadenas:** `replace` (saneamiento y sustitución de texto) y `randomString` (generación determinista de claves/tokens aleatorios).\n",
|
||||||
|
"\n",
|
||||||
|
"### Especificación BNF (Sección VI)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"/* [CORRECCIÓN] Todas las subreglas de <util_command> están ahora completamente expandidas. */\n",
|
||||||
|
"<util_command> ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd> | <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>\n",
|
||||||
|
"\n",
|
||||||
|
"/* Manipulación de listas y JSON */\n",
|
||||||
|
"<json_list_cmd> ::= \"variableToList(\" <expression> \",\" <identifier> \")\"\n",
|
||||||
|
" | \"itemFromList(\" <identifier> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||||
|
" | \"variableFromJSON(\" <identifier> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||||
|
" | \"AddVariableToJSON(\" <expression> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||||
|
"\n",
|
||||||
|
"/* Criptografía */\n",
|
||||||
|
"<crypto_cmd> ::= \"encodeSHA256(\" <identifier_or_string> \",\" <identifier> \")\"\n",
|
||||||
|
" | \"encodeMD5(\" <identifier_or_string> \",\" <identifier> \")\"\n",
|
||||||
|
"\n",
|
||||||
|
"/* Expresiones regulares */\n",
|
||||||
|
"<regex_cmd> ::= \"getRegex(\" <identifier> \",\" <stringliteral> \",\" <identifier> \")\"\n",
|
||||||
|
"\n",
|
||||||
|
"<datetime_cmd> ::= \"getDateTime(\" <stringliteral> \",\" <expression> \",\" <stringliteral> \",\" <identifier> \")\"\n",
|
||||||
|
"/* Argumentos: formato_salida, epoch_origen, zona_horaria, destino */\n",
|
||||||
|
"\n",
|
||||||
|
"<stamp_cmd> ::= \"stampToDatetime(\" <expression> \",\" <stringliteral> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||||
|
"/* Argumentos: epoch_origen, formato, timedelta, destino */\n",
|
||||||
|
" | \"getTimeStamp(\" <stringliteral> \",\" <stringliteral> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||||
|
"/* Argumentos: fecha_string, formato_entrada, timedelta, destino */\n",
|
||||||
|
"\n",
|
||||||
|
"<string_cmd> ::= \"randomString(\" <expression> \",\" <identifier> \")\"\n",
|
||||||
|
"/* Argumentos: longitud, destino */\n",
|
||||||
|
"\n",
|
||||||
|
"<replace_cmd> ::= \"replace(\" <identifier_or_string> \",\" <stringliteral> \",\" <stringliteral> \",\" <identifier> \")\"\n",
|
||||||
|
"/* Argumentos: origen, patron_busqueda, reemplazo, destino */\n",
|
||||||
|
"\n",
|
||||||
|
"[3] id=chunk-3 source=Untitled\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"---\n",
|
||||||
|
"\n",
|
||||||
|
"## SECCIÓN IX: Expresiones y Gramática Léxica Estricta\n",
|
||||||
|
"\n",
|
||||||
|
"Esta sección es el corazón matemático evaluador de AVAP. Define la jerarquía exacta (Precedencia) y provee soporte nativo para características avanzadas similares a Python.\n",
|
||||||
|
"\n",
|
||||||
|
"### 9.1 Cast de Tipos Explícito\n",
|
||||||
|
"AVAP permite conversiones de tipos (Type Casting) en cualquier evaluación utilizando funciones constructoras estándar. Puedes transformar variables dinámicamente usando `int(var)`, `float(var)` o `str(var)`.\n",
|
||||||
|
"\n",
|
||||||
|
"### 9.2 Slicing y Comprensiones (Comprehensions)\n",
|
||||||
|
"* **Slicing (Cortes):** Puedes extraer fragmentos de listas o strings utilizando la notación de dos puntos. Ejemplo: `mi_lista[1:4]` (extrae desde el índice 1 hasta el 3).\n",
|
||||||
|
"* **Comprehensions:** AVAP soporta la construcción rápida de listas mediante iteradores en una sola línea, permitiendo filtrar y mapear colecciones enteras (ej. `[x * 2 for x in valores if x > 0]`).\n",
|
||||||
|
"\n",
|
||||||
|
"### 9.3 Análisis Léxico (Lexer) y Documentación\n",
|
||||||
|
"AVAP cuenta con tres niveles de descarte de texto para anotaciones humanas:\n",
|
||||||
|
"1. **Comentarios de Línea (`//`):** Ignora el texto hasta el salto de línea.\n",
|
||||||
|
"2. **Comentarios de Bloque (`/* ... */`):** Para aislar bloques enteros multilínea.\n",
|
||||||
|
"3. **Comentarios de Documentación (`///`):** Utilizados por analizadores de código o IDEs para generar documentación técnica automática (Docstrings) a partir del código fuente.\n",
|
||||||
|
"\n",
|
||||||
|
"### Especificación BNF (Sección IX)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"/* Jerarquía de Expresiones (Precedencia de menor a mayor) */\n",
|
||||||
|
"<expression> ::= <logical_or>\n",
|
||||||
|
"<logical_or> ::= <logical_and> ( \"or\" <logical_and> )*\n",
|
||||||
|
"<logical_and> ::= <logical_not> ( \"and\" <logical_not> )*\n",
|
||||||
|
"<logical_not> ::= \"not\" <logical_not> | <comparison>\n",
|
||||||
|
"\n",
|
||||||
|
"<comparison> ::= <arithmetic> ( <comp_op> <arithmetic> )*\n",
|
||||||
|
"<comp_op> ::= \"==\" | \"!=\" | \"<\" | \">\" | \"<=\" | \">=\" | \"in\" | \"is\"\n",
|
||||||
|
"\n",
|
||||||
|
"<arithmetic> ::= <term> ( ( \"+\" | \"-\" ) <term> )*\n",
|
||||||
|
"<term> ::= <factor> ( ( \"*\" | \"/\" | \"%\" ) <factor> )*\n",
|
||||||
|
"<factor> ::= ( \"+\" | \"-\" ) <factor> | <power>\n",
|
||||||
|
"<power> ::= <primary> [ \"**\" <factor> ]\n",
|
||||||
|
"\n",
|
||||||
|
"/* Primarios y Átomos (Accesos, Castings, Slicing, Métodos y Funciones)\n",
|
||||||
|
" La regla <primary> cubre también el acceso a métodos de objetos conector\n",
|
||||||
|
" (conector.metodo(...)) y el acceso por clave a sus resultados (resultado[\"key\"]) */\n",
|
||||||
|
"<primary> ::= <atom>\n",
|
||||||
|
" | <primary> \".\" <identifier>\n",
|
||||||
|
" | <primary> \"[\" <expression> \"]\"\n",
|
||||||
|
" | <primary> \"[\" [<expression>] \":\" [<expression>] [\":\" [<expression>]] \"]\"\n",
|
||||||
|
" | <primary> \"(\" [<argument_list>] \")\"\n",
|
||||||
|
"\n",
|
||||||
|
"<atom> ::= <identifier>\n",
|
||||||
|
" | \"$\" <identifier>\n",
|
||||||
|
" | <literal>\n",
|
||||||
|
" | \"(\" <expression> \")\"\n",
|
||||||
|
" | <list_display>\n",
|
||||||
|
" | <dict_display>\n",
|
||||||
|
"\n",
|
||||||
|
"/* Estructuras de Datos, Comprensiones y Argumentos */\n",
|
||||||
|
"<list_display> ::= \"[\" [<argument_list>] \"]\"\n",
|
||||||
|
" | \"[\" <expression> \"for\" <identifier> \"in\" <expression> [<if_clause>] \"]\"\n",
|
||||||
|
"<if_clause> ::= \"if\" <expression>\n",
|
||||||
|
"<dict_display> ::= \"{\" [<key_datum_list>] \"}\"\n",
|
||||||
|
"<key_datum_list> ::= <key_datum> ( \",\" <key_datum> )*\n",
|
||||||
|
"<key_datum> ::= <expression> \":\" <expression>\n",
|
||||||
|
"<argument_list> ::= <expression> ( \",\" <expression> )*\n",
|
||||||
|
"\n",
|
||||||
|
"/* Tipo numérico unificado */\n",
|
||||||
|
"<number> ::= <floatnumber> | <integer>\n",
|
||||||
|
"\n",
|
||||||
|
"/* Literales (Tipos de Datos Primitivos Soportados) */\n",
|
||||||
|
"<literal> ::= <stringliteral> | <number> | <boolean> | \"None\"\n",
|
||||||
|
"<boolean> ::= \"True\" | \"False\"\n",
|
||||||
|
"<integer> ::= [0-9]+\n",
|
||||||
|
"<floatnumber> ::= [0-9]+ \".\" [0-9]* | \".\" [0-9]+\n",
|
||||||
|
"\n",
|
||||||
|
"/* Cadenas de Texto con soporte de secuencias de escape */\n",
|
||||||
|
"<stringliteral> ::= \"\\\"\" <text_double> \"\\\"\" | \"'\" <text_single> \"'\"\n",
|
||||||
|
"<escape_sequence> ::= \"\\\\\" ( \"\\\"\" | \"'\" | \"\\\\\" | \"n\" | \"t\" | \"r\" | \"0\" )\n",
|
||||||
|
"<text_double> ::= ( [^\"\\\\] | <escape_sequence> )*\n",
|
||||||
|
"<text_single> ::= ( [^'\\\\] | <escape_sequence> )*\n",
|
||||||
|
"<identifier_or_string> ::= <identifier> | <stringliteral>\n",
|
||||||
|
"\n",
|
||||||
|
"/* Reglas de Comentarios para el Lexer\n",
|
||||||
|
" El lexer aplica longest-match: /// debe evaluarse ANTES que // */\n",
|
||||||
|
"<doc_comment> ::= \"///\" <any_text>\n",
|
||||||
|
"<line_comment> ::= \"//\" <any_text>\n",
|
||||||
|
"<block_comment> ::= \"/*\" <any_content> \"*/\"\n",
|
||||||
|
"<any_text> ::= [^\\r\\n]*\n",
|
||||||
|
"<any_content> ::= /* Cualquier secuencia de caracteres que no contenga la subcadena \"*/\" */\n",
|
||||||
|
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
||||||
|
"\n",
|
||||||
|
"What types of includes does AVAP have?\n",
|
||||||
|
"==================================\u001b[1m Ai Message \u001b[0m==================================\n",
|
||||||
|
"\n",
|
||||||
|
"AVAP has two main types of include:\n",
|
||||||
|
"\n",
|
||||||
|
"1. **<connector_instantiation>:** This is used to instantiate a connector, which could be for a database connection or a third-party API.\n",
|
||||||
|
"2. **<orm_connector_init>:** This term seems to be related to initializing an Object-Relational Mapping (ORM) connector, indicating that the context suggests it's part of a specific ORM setup.\n",
|
||||||
|
"\n",
|
||||||
|
"Both types are treated similarly in terms of grammar but differ semantically by their purpose - one is for database connections or third-party APIs, while the other is specifically for connecting to ORMs. The engine selects the appropriate adapter based on runtime context.\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"a = stream_graph_updates(user_input, agentic_graph)"
|
"a = stream_graph_updates(user_input, guided_graph)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -1,134 +1,60 @@
|
||||||
from enum import Enum
|
|
||||||
import typer
|
import typer
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from elasticsearch import Elasticsearch
|
|
||||||
from langchain_elasticsearch import ElasticsearchStore
|
|
||||||
from chonkie import SemanticChunker
|
|
||||||
|
|
||||||
from src.utils.emb_factory import create_embedding_model
|
|
||||||
from scripts.pipelines.tasks.chunk import (
|
from scripts.pipelines.tasks.chunk import (
|
||||||
read_files,
|
fetch_documents,
|
||||||
get_chunk_docs,
|
process_documents,
|
||||||
convert_chunks_to_document
|
export_documents,
|
||||||
|
ingest_documents
|
||||||
)
|
)
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
||||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
|
||||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
|
||||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
|
||||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
|
||||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
|
||||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
|
||||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
|
||||||
|
|
||||||
class DistanceStrategy(str, Enum):
|
|
||||||
euclidean = "EUCLIDEAN_DISTANCE"
|
|
||||||
max_inner_product = "MAX_INNER_PRODUCT"
|
|
||||||
dot_product = "DOT_PRODUCT"
|
|
||||||
jaccard = "JACCARD"
|
|
||||||
cosine = "COSINE"
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def elasticsearch_ingestion(
|
def elasticsearch_ingestion(
|
||||||
docs_folder_path: str = "docs",
|
docs_folder_path: str = "docs/samples",
|
||||||
|
output_path: str = "ingestion/chunks.json",
|
||||||
|
docs_extension: list[str] = [".md", ".avap"],
|
||||||
|
es_index: str = "avap-docs-test-v3",
|
||||||
es_request_timeout: int = 120,
|
es_request_timeout: int = 120,
|
||||||
es_max_retries: int = 5,
|
es_max_retries: int = 5,
|
||||||
es_retry_on_timeout: bool = True,
|
es_retry_on_timeout: bool = True,
|
||||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
delete_es_index: bool = True
|
||||||
chunk_size: int = 2048,
|
) -> None:
|
||||||
chunk_threshold: float = 0.5,
|
"""
|
||||||
chunk_similarity_window: int = 3,
|
Pipeline to ingest documents into an Elasticsearch index.
|
||||||
chunk_skip_window: int = 1,
|
The pipeline includes fetching documents from a specified folder, processing them into chunks, and then ingesting those chunks into the specified Elasticsearch index.
|
||||||
):
|
|
||||||
|
Args:
|
||||||
|
docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
|
||||||
|
docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".md", ".avap"].
|
||||||
|
es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-docs-test-v3".
|
||||||
|
es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
|
||||||
|
es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
|
||||||
|
es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
|
||||||
|
delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||||
logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
|
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||||
avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
|
docs_path = fetch_documents(docs_folder_path, docs_extension)
|
||||||
avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
|
|
||||||
|
|
||||||
# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
|
logger.info("Processing docs...")
|
||||||
chapters = sorted({
|
chunked_docs = process_documents(docs_path)
|
||||||
p.name.split("_")[0]
|
|
||||||
for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
|
|
||||||
})
|
|
||||||
|
|
||||||
avap_web_docs_chapters = [
|
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||||
item
|
elasticsearch_docs = ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
|
||||||
for chapter in chapters
|
es_retry_on_timeout, delete_es_index)
|
||||||
for item in read_files(
|
|
||||||
f"{docs_folder_path}/developer.avapframework.com",
|
|
||||||
f"{chapter}_",
|
|
||||||
concatenate=True
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
|
logger.info(f"Exporting processed documents to {output_path}...")
|
||||||
avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
|
export_documents(elasticsearch_docs, output_path)
|
||||||
|
|
||||||
logger.info("Instantiating semantic chunker...")
|
logger.info(f"Finished ingesting in {es_index}.")
|
||||||
chunker = SemanticChunker(
|
|
||||||
embedding_model=HF_EMB_MODEL_NAME,
|
|
||||||
chunk_size=chunk_size,
|
|
||||||
threshold=chunk_threshold,
|
|
||||||
similarity_window=chunk_similarity_window,
|
|
||||||
skip_window=chunk_skip_window
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info("Chunking AVAP GitHub docs...")
|
|
||||||
avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
|
|
||||||
|
|
||||||
logger.info("Chunking AVAP web docs chapters...")
|
|
||||||
avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
|
|
||||||
|
|
||||||
logger.info("Creating Langchain Document to index...")
|
|
||||||
avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
|
|
||||||
avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
|
|
||||||
avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
|
|
||||||
avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
|
|
||||||
avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
|
|
||||||
avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
|
|
||||||
|
|
||||||
logger.info("Connecting to Elasticsearch...")
|
|
||||||
try:
|
|
||||||
es = Elasticsearch(
|
|
||||||
ELASTICSEARCH_LOCAL_URL,
|
|
||||||
request_timeout=es_request_timeout,
|
|
||||||
max_retries=es_max_retries,
|
|
||||||
retry_on_timeout=es_retry_on_timeout,
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
logger.exception("Failed to connect to Elasticsearch.")
|
|
||||||
raise
|
|
||||||
|
|
||||||
logger.info("Instantiating embeddings model...")
|
|
||||||
try:
|
|
||||||
embeddings = create_embedding_model(
|
|
||||||
provider="ollama",
|
|
||||||
model=OLLAMA_EMB_MODEL_NAME,
|
|
||||||
base_url=OLLAMA_LOCAL_URL,
|
|
||||||
)
|
|
||||||
except:
|
|
||||||
logger.exception("Failed to instantiate embeddings model.")
|
|
||||||
raise
|
|
||||||
|
|
||||||
logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
|
|
||||||
if es.indices.exists(index=ELASTICSEARCH_INDEX):
|
|
||||||
es.indices.delete(index=ELASTICSEARCH_INDEX)
|
|
||||||
|
|
||||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
|
||||||
ElasticsearchStore.from_documents(
|
|
||||||
avap_documents,
|
|
||||||
embeddings,
|
|
||||||
client=es,
|
|
||||||
index_name=ELASTICSEARCH_INDEX,
|
|
||||||
distance_strategy=distance_strategy.value,
|
|
||||||
)
|
|
||||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,9 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Use:
|
Use:
|
||||||
python generate_mbpp_avap.py
|
python generate_mbap.py
|
||||||
python generate_mbpp_avap.py --lrm path/to/avap.md
|
python generate_mbap.py --lrm path/to/avap.md
|
||||||
python generate_mbpp_avap.py --lrm avap.md --output output/mbpp_avap.json --problems 300
|
python generate_mbap.py --lrm avap.md --output output/mbpp_avap.json --problems 300
|
||||||
|
|
||||||
Requirements:
|
Requirements:
|
||||||
pip install anthropic
|
pip install anthropic
|
||||||
|
|
@ -53,7 +53,7 @@ REGLAS ESTRICTAS para el código AVAP generado:
|
||||||
5. if() Modo 1: if(var_o_literal, var_o_literal, "operador")
|
5. if() Modo 1: if(var_o_literal, var_o_literal, "operador")
|
||||||
— los argumentos NO pueden ser expresiones de acceso como dict['key'];
|
— los argumentos NO pueden ser expresiones de acceso como dict['key'];
|
||||||
hay que extraer el valor a una variable propia primero.
|
hay que extraer el valor a una variable propia primero.
|
||||||
6. if() Modo 2: if(None, None, "expresion_completa_como_string")
|
6. if() Modo 2: if(None, None, `expresion_completa_como_string`)
|
||||||
7. _status se asigna con: addVar(_status, 404)
|
7. _status se asigna con: addVar(_status, 404)
|
||||||
8. ormAccessSelect firma: ormAccessSelect(campos, "tabla", selector, varTarget)
|
8. ormAccessSelect firma: ormAccessSelect(campos, "tabla", selector, varTarget)
|
||||||
— selector puede ser cadena vacía.
|
— selector puede ser cadena vacía.
|
||||||
|
|
@ -62,7 +62,7 @@ REGLAS ESTRICTAS para el código AVAP generado:
|
||||||
|
|
||||||
MODO DE EJECUCIÓN — MUY IMPORTANTE:
|
MODO DE EJECUCIÓN — MUY IMPORTANTE:
|
||||||
- El código se ejecuta DIRECTAMENTE, línea a línea, sin servidor ni registro de endpoints.
|
- El código se ejecuta DIRECTAMENTE, línea a línea, sin servidor ni registro de endpoints.
|
||||||
- NUNCA uses registerEndpoint(), NUNCA uses mainHandler(), NUNCA envuelvas el código en funciones solo para ejecutarlo salvo que queramos probar la funcionalidad de funciones.
|
- NUNCA uses registerEndpoint(), NUNCA uses mainHandler(), NUNCA envuelvas el código en funciones solo para ejecutarlo.
|
||||||
- El código correcto es simplemente las instrucciones en línea, por ejemplo:
|
- El código correcto es simplemente las instrucciones en línea, por ejemplo:
|
||||||
result = "Hello World"
|
result = "Hello World"
|
||||||
addResult(result)
|
addResult(result)
|
||||||
|
|
@ -82,29 +82,48 @@ Estructura exacta de cada elemento:
|
||||||
"task_id": <número entero>,
|
"task_id": <número entero>,
|
||||||
"text": "<enunciado del problema en español>",
|
"text": "<enunciado del problema en español>",
|
||||||
"code": "<código AVAP con saltos de línea como \\n>",
|
"code": "<código AVAP con saltos de línea como \\n>",
|
||||||
|
"test_inputs": { "<param1>": <valor1>, "<param2>": <valor2> },
|
||||||
"test_list": ["<expr_python_1>", "<expr_python_2>"]
|
"test_list": ["<expr_python_1>", "<expr_python_2>"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FORMATO DE test_inputs — MUY IMPORTANTE:
|
||||||
|
- Es un objeto JSON con un valor fijo para cada variable que el código recibe via addParam().
|
||||||
|
- Los nombres de las claves deben coincidir EXACTAMENTE con el nombre de variable usado en addParam().
|
||||||
|
- Los valores deben ser concretos y representativos del problema (no genéricos como "test" o 123).
|
||||||
|
- Si el código no tiene ningún addParam(), el campo test_inputs debe ser un objeto vacío: {}
|
||||||
|
- Estos valores son los que el evaluador inyectará en el stack antes de ejecutar el código,
|
||||||
|
de modo que las aserciones de test_list puedan validar las variables de salida resultantes.
|
||||||
|
|
||||||
|
Ejemplo con addParam:
|
||||||
|
código: addParam("password", password)\\nencodeSHA256(password, hashed)\\naddResult(hashed)
|
||||||
|
test_inputs: { "password": "secret123" }
|
||||||
|
test_list: ["re.match(r'^[a-f0-9]{64}$', hashed)"]
|
||||||
|
|
||||||
|
Ejemplo sin addParam:
|
||||||
|
código: randomString(16, token)\\naddResult(token)
|
||||||
|
test_inputs: {}
|
||||||
|
test_list: ["re.match(r'^[a-zA-Z0-9]{16}$', token)"]
|
||||||
|
|
||||||
FORMATO DE test_list — MUY IMPORTANTE:
|
FORMATO DE test_list — MUY IMPORTANTE:
|
||||||
Cada aserción debe ser una expresión Python con re.match() o re.search()
|
Cada aserción debe ser una expresión Python con re.match()
|
||||||
evaluable directamente sobre las variables del stack AVAP (disponibles como
|
evaluable directamente sobre las variables del stack AVAP (disponibles como
|
||||||
variables Python locales). El módulo 're' está siempre disponible.
|
variables Python locales). El módulo 're' está siempre disponible.
|
||||||
La expresión debe devolver un match object (truthy) si el test pasa.
|
La expresión debe devolver un match object (truthy) si el test pasa.
|
||||||
|
|
||||||
Reglas estrictas:
|
Reglas estrictas:
|
||||||
- USA ÚNICAMENTE re.match(r'<patrón>', <variable>) o re.search(r'<patrón>', str(<variable>))
|
- USA ÚNICAMENTE re.match(r'<patrón>', <variable>)
|
||||||
|
- NO combines expresiones re.match en una aserción; cada aserción tiene que ser un único re.match(r'<patrón>', <variable>)
|
||||||
- Convierte a string si es necesario: re.match(r'^\\d+$', str(result))
|
- Convierte a string si es necesario: re.match(r'^\\d+$', str(result))
|
||||||
- Puedes encadenar con 'and': re.match(r'^[a-zA-Z0-9]{32}$', token) and re.match(r'.{32}', token)
|
- Puedes encadenar con 'and': re.match(r'^[a-zA-Z0-9]{32}$', token) and re.match(r'.{32}', token)
|
||||||
- Las variables referenciadas deben existir en el stack tras ejecutar el código.
|
- Las variables referenciadas deben existir en el stack tras ejecutar el código.
|
||||||
- NUNCA uses comparaciones directas (==, !=, >, <).
|
- NUNCA uses comparaciones directas (==, !=, >, <).
|
||||||
- NUNCA uses isinstance(), len(), assert, ni texto descriptivo.
|
- NUNCA uses isinstance(), len(), assert, ni texto descriptivo.
|
||||||
- NUNCA uses nada que no sea re.match() o re.search().
|
- NUNCA uses nada que no sea re.match().
|
||||||
|
|
||||||
Ejemplos correctos de test_list:
|
Ejemplos correctos de test_list:
|
||||||
"re.match(r'^[a-f0-9]{64}$', hashed)"
|
"re.match(r'^[a-f0-9]{64}$', hashed)"
|
||||||
"re.match(r'^[a-zA-Z0-9]{32}$', token)"
|
"re.match(r'^[a-zA-Z0-9]{32}$', token)"
|
||||||
"re.match(r'^\\d{4}-\\d{2}-\\d{2}$', date_str)"
|
"re.match(r'^\\d{4}-\\d{2}-\\d{2}$', date_str)"
|
||||||
"re.search(r'Hello', result)"
|
|
||||||
"re.match(r'^-?\\d+(\\.\\d+)?$', str(result))"
|
"re.match(r'^-?\\d+(\\.\\d+)?$', str(result))"
|
||||||
"re.match(r'^(par|impar)$', result)"
|
"re.match(r'^(par|impar)$', result)"
|
||||||
"re.match(r'^40[134]$', str(_status))"
|
"re.match(r'^40[134]$', str(_status))"
|
||||||
|
|
@ -138,22 +157,26 @@ Responde ÚNICAMENTE con el array JSON. Sin texto antes ni después.
|
||||||
|
|
||||||
def parse_response(raw: str):
|
def parse_response(raw: str):
|
||||||
text = raw.strip()
|
text = raw.strip()
|
||||||
|
|
||||||
if text.startswith("```"):
|
if text.startswith("```"):
|
||||||
lines = text.splitlines()
|
lines = text.splitlines()
|
||||||
inner = lines[1:]
|
inner = lines[1:]
|
||||||
if inner and inner[-1].strip() == "```":
|
if inner and inner[-1].strip() == "```":
|
||||||
inner = inner[:-1]
|
inner = inner[:-1]
|
||||||
text = "\n".join(inner).strip()
|
text = "\n".join(inner).strip()
|
||||||
|
|
||||||
problems = json.loads(text)
|
problems = json.loads(text)
|
||||||
|
|
||||||
if not isinstance(problems, list):
|
if not isinstance(problems, list):
|
||||||
raise ValueError("answer is not a JSON.")
|
raise ValueError("response is not a JSON array")
|
||||||
|
|
||||||
for p in problems:
|
for p in problems:
|
||||||
for field in ("task_id", "text", "code", "test_list"):
|
for field in ("task_id", "text", "code", "test_list"):
|
||||||
if field not in p:
|
if field not in p:
|
||||||
raise ValueError(f"field '{field}' not found in a problem.")
|
raise ValueError(f"Field missing '{field}' in task_id={p.get('task_id','?')}.")
|
||||||
|
if "test_inputs" not in p:
|
||||||
|
p["test_inputs"] = {}
|
||||||
|
if not isinstance(p["test_inputs"], dict):
|
||||||
|
raise ValueError(f"'test_inputs' must be a JSON object (task_id={p.get('task_id','?')}).")
|
||||||
|
|
||||||
return problems
|
return problems
|
||||||
|
|
||||||
|
|
@ -8,8 +8,7 @@ from botocore.config import Config
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from langchain_core.messages import SystemMessage, HumanMessage
|
from langchain_core.messages import SystemMessage, HumanMessage
|
||||||
from src.utils.llm_factory import create_chat_model
|
from src.utils.llm_factory import create_chat_model
|
||||||
from src.config import RAW_DIR, INTERIM_DIR
|
from scripts.pipelines.tasks.prompts import get_prompt_mbpp
|
||||||
from scripts.pipelines.input.prompts import get_prompt_mbpp
|
|
||||||
|
|
||||||
|
|
||||||
app = typer.Typer()
|
app = typer.Typer()
|
||||||
|
|
|
||||||
|
|
@ -1,136 +1,277 @@
|
||||||
import os
|
import json
|
||||||
import re
|
from copy import deepcopy
|
||||||
import uuid
|
from dataclasses import replace
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
from chonkie import (
|
||||||
|
Chunk,
|
||||||
|
ElasticHandshake,
|
||||||
|
FileFetcher,
|
||||||
|
MarkdownChef,
|
||||||
|
TextChef,
|
||||||
|
TokenChunker,
|
||||||
|
MarkdownDocument
|
||||||
|
)
|
||||||
|
from elasticsearch import Elasticsearch
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from chonkie import Chunk, SemanticChunker
|
from transformers import AutoTokenizer
|
||||||
from langchain_core.documents import Document
|
|
||||||
|
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||||
|
from src.config import settings
|
||||||
|
|
||||||
|
|
||||||
def replace_javascript_with_avap(text: str) -> str:
|
def _get_text(element) -> str:
|
||||||
"""
|
for attr in ("text", "content", "markdown"):
|
||||||
Replace mentions of javascript language with avap in the text.
|
value = getattr(element, attr, None)
|
||||||
Handles code blocks, language identifiers, and references.
|
if isinstance(value, str):
|
||||||
|
return value
|
||||||
Args:
|
raise AttributeError(
|
||||||
text: The text to process.
|
f"Could not extract text from element of type {type(element).__name__}"
|
||||||
|
|
||||||
Returns:
|
|
||||||
The text with javascript references replaced with avap.
|
|
||||||
"""
|
|
||||||
# Replace ```javascript with ```avap
|
|
||||||
text = text.replace("```javascript", "```avap")
|
|
||||||
|
|
||||||
# Replace ```js with ```avap
|
|
||||||
text = text.replace("```js", "```avap")
|
|
||||||
|
|
||||||
# Replace common phrases (case-insensitive)
|
|
||||||
text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
|
|
||||||
text = re.sub(
|
|
||||||
r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
|
|
||||||
)
|
)
|
||||||
text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
|
|
||||||
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
|
||||||
def read_files(
|
def _merge_markdown_document(processed_doc: MarkdownDocument) -> MarkdownDocument:
|
||||||
folder_path: str, file_prefix: str | None = None, concatenate: bool = True
|
elements = []
|
||||||
) -> list[dict]:
|
|
||||||
|
for chunk in processed_doc.chunks:
|
||||||
|
elements.append(("chunk", chunk.start_index, chunk.end_index, chunk))
|
||||||
|
|
||||||
|
for code in processed_doc.code:
|
||||||
|
elements.append(("code", code.start_index, code.end_index, code))
|
||||||
|
|
||||||
|
for table in processed_doc.tables:
|
||||||
|
elements.append(("table", table.start_index, table.end_index, table))
|
||||||
|
|
||||||
|
elements.sort(key=lambda item: (item[1], item[2]))
|
||||||
|
|
||||||
|
merged_chunks = []
|
||||||
|
current_chunk = None
|
||||||
|
current_parts = []
|
||||||
|
current_end_index = None
|
||||||
|
current_token_count = None
|
||||||
|
|
||||||
|
def flush():
|
||||||
|
nonlocal current_chunk, current_parts, current_end_index, current_token_count
|
||||||
|
|
||||||
|
if current_chunk is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
merged_text = "\n\n".join(part for part in current_parts if part)
|
||||||
|
|
||||||
|
merged_chunks.append(
|
||||||
|
replace(
|
||||||
|
current_chunk,
|
||||||
|
text=merged_text,
|
||||||
|
end_index=current_end_index,
|
||||||
|
token_count=current_token_count,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
current_chunk = None
|
||||||
|
current_parts = []
|
||||||
|
current_end_index = None
|
||||||
|
current_token_count = None
|
||||||
|
|
||||||
|
for kind, _, _, element in elements:
|
||||||
|
if kind == "chunk":
|
||||||
|
flush()
|
||||||
|
current_chunk = element
|
||||||
|
current_parts = [_get_text(element)]
|
||||||
|
current_end_index = element.end_index
|
||||||
|
current_token_count = element.token_count
|
||||||
|
continue
|
||||||
|
|
||||||
|
if current_chunk is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
current_parts.append(_get_text(element))
|
||||||
|
current_end_index = max(current_end_index, element.end_index)
|
||||||
|
current_token_count += getattr(element, "token_count", 0)
|
||||||
|
|
||||||
|
flush()
|
||||||
|
|
||||||
|
fused_processed_doc = deepcopy(processed_doc)
|
||||||
|
fused_processed_doc.chunks = merged_chunks
|
||||||
|
fused_processed_doc.code = processed_doc.code
|
||||||
|
fused_processed_doc.tables = processed_doc.tables
|
||||||
|
|
||||||
|
return fused_processed_doc
|
||||||
|
|
||||||
|
|
||||||
|
class ElasticHandshakeWithMetadata(ElasticHandshake):
    """ElasticHandshake variant that indexes per-chunk metadata with each document.

    Items written through this handshake are dicts shaped like
    ``{"chunk": Chunk, "extra_metadata": {...}}``. A bare ``Chunk`` is also
    accepted and is treated as an item without extra metadata.
    """

    @staticmethod
    def _normalize_items(chunks: Any) -> list[dict[str, Any]]:
        """Return ``chunks`` as a list of ``{"chunk": ..., ...}`` dicts."""
        # A single item (bare Chunk or one wrapped dict) becomes a one-item list.
        if isinstance(chunks, (Chunk, dict)):
            chunks = [chunks]
        # Bare Chunk objects are wrapped so the rest of the class can index
        # every item uniformly with item["chunk"].
        return [item if isinstance(item, dict) else {"chunk": item} for item in chunks]

    def _create_bulk_actions(self, chunks: list[dict]) -> list[dict[str, Any]]:
        """Build one bulk-index action per item, merging its metadata into ``_source``.

        Args:
            chunks: Items with a "chunk" key and an optional "extra_metadata" dict.

        Returns:
            One action dict per item with "_index", "_id" and "_source" keys.

        Raises:
            ValueError: If the embedding model returns a different number of
                vectors than there are items.
        """
        items = self._normalize_items(chunks)
        if not items:
            return []

        embeddings = self.embedding_model.embed_batch([item["chunk"].text for item in items])

        actions = []
        # strict=True turns a short/long embedding response into an error
        # instead of silently pairing chunks with the wrong vectors.
        for i, (item, embedding) in enumerate(zip(items, embeddings, strict=True)):
            chunk = item["chunk"]
            source = {
                "text": chunk.text,
                "embedding": embedding,
                "start_index": chunk.start_index,
                "end_index": chunk.end_index,
                "token_count": chunk.token_count,
            }

            # Metadata is merged last, so on a key clash it overrides the core fields.
            extra_metadata = item.get("extra_metadata")
            if extra_metadata:
                source.update(extra_metadata)

            actions.append({
                "_index": self.index_name,
                "_id": self._generate_id(i, chunk),
                "_source": source,
            })

        return actions

    def write(
        self, chunks: Union[Chunk, dict[str, Any], list[Union[Chunk, dict[str, Any]]]]
    ) -> list[dict[str, Any]]:
        """Write the chunks to the Elasticsearch index using the bulk API.

        Args:
            chunks: A single item or a list of items; each item is either a
                ``{"chunk": Chunk, "extra_metadata": {...}}`` dict or a bare ``Chunk``.

        Returns:
            The bulk actions that were submitted (including their "_source").
        """
        from elasticsearch.helpers import bulk

        actions = self._create_bulk_actions(self._normalize_items(chunks))

        # raise_on_error=False: failures are collected and logged below rather
        # than aborting the whole batch.
        success, errors = bulk(self.client, actions, raise_on_error=False)

        if errors:
            logger.warning(f"Encountered {len(errors)} errors during bulk indexing.")  # type: ignore
            # Only the first few errors are logged to keep the output readable.
            for i, error in enumerate(errors[:5]):  # type: ignore
                logger.error(f"Error {i + 1}: {error}")

        logger.info(f"Chonkie wrote {success} chunks to Elasticsearch index: {self.index_name}")

        return actions
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
|
||||||
"""
|
"""
|
||||||
Read files in a folder whose names start with a given prefix.
|
Fetch files from a folder that match the specified extensions.
|
||||||
Replaces javascript language markers with avap.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
folder_path: Path to the folder to search in.
|
docs_folder_path (str): Path to the folder containing documents
|
||||||
file_prefix: The prefix that file names must start with.
|
docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"])
|
||||||
If None, all files in the folder are included.
|
|
||||||
concatenate: Whether to concatenate the contents of the files.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of dictionaries, each containing 'content' and 'title' keys.
|
List of Paths to the fetched documents
|
||||||
If concatenate is True, returns a single dict with concatenated content and title as 'appendix'.
|
|
||||||
If concatenate is False, returns one dict per file with filename as title.
|
|
||||||
"""
|
"""
|
||||||
contents = []
|
fetcher = FileFetcher()
|
||||||
filenames = []
|
docs_path = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}", ext=docs_extension)
|
||||||
|
return docs_path
|
||||||
for filename in sorted(os.listdir(folder_path)):
|
|
||||||
include_file = file_prefix is None or filename.startswith(file_prefix)
|
|
||||||
if include_file:
|
|
||||||
file_path = os.path.join(folder_path, filename)
|
|
||||||
if os.path.isfile(file_path):
|
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
|
||||||
content = f.read()
|
|
||||||
cleaned_content = content.strip()
|
|
||||||
if cleaned_content:
|
|
||||||
contents.append(cleaned_content)
|
|
||||||
filenames.append(filename)
|
|
||||||
|
|
||||||
if concatenate:
|
|
||||||
concatenated = "\n".join(contents)
|
|
||||||
processed_content = replace_javascript_with_avap(concatenated)
|
|
||||||
title = file_prefix if file_prefix is not None else "all_files"
|
|
||||||
return [{"content": processed_content, "title": title}]
|
|
||||||
else:
|
|
||||||
return [
|
|
||||||
{"content": replace_javascript_with_avap(content), "title": filename}
|
|
||||||
for content, filename in zip(contents, filenames)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def get_chunk_docs(docs: list[dict], chunker: SemanticChunker) -> list[list[Chunk]]:
|
def process_documents(docs_path: list[Path]) -> list[dict[str, Chunk | dict[str, Any]]]:
|
||||||
"""
|
"""
|
||||||
Chunk the content of the documents using the provided chunker.
|
Process documents by applying appropriate chefs and chunking strategies based on file type.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
docs: A list of dictionaries, each containing 'content' and 'title' keys.
|
docs_path (list[Path]): List of Paths to the documents to be processed
|
||||||
chunker: An instance of SemanticChunker to use for chunking the content.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of lists of Chunk objects, where each inner list corresponds to the chunks of a
|
List of dicts with "chunk" (Chunk object) and "metadata" (dict with file info)
|
||||||
single document.
|
|
||||||
"""
|
"""
|
||||||
list_chunks = []
|
processed_docs = []
|
||||||
|
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||||
|
chef_md = MarkdownChef(tokenizer=custom_tokenizer)
|
||||||
|
chef_txt = TextChef()
|
||||||
|
chunker = TokenChunker(tokenizer=custom_tokenizer)
|
||||||
|
|
||||||
for doc in docs:
|
for doc_path in docs_path:
|
||||||
content = doc["content"]
|
doc_extension = doc_path.suffix.lower()
|
||||||
chunks = chunker.chunk(content)
|
filename = doc_path.name
|
||||||
for chunk in chunks:
|
|
||||||
chunk.context = {"source": doc["title"]}
|
|
||||||
list_chunks.append(chunks)
|
|
||||||
logger.info(f"Finished chunking {doc['title']}")
|
|
||||||
|
|
||||||
return list_chunks
|
if doc_extension == ".md":
|
||||||
|
processed_doc = chef_md.process(doc_path)
|
||||||
|
fused_doc = _merge_markdown_document(processed_doc)
|
||||||
|
chunked_doc = fused_doc.chunks
|
||||||
|
elif doc_extension == ".avap":
|
||||||
|
processed_doc = chef_txt.process(doc_path)
|
||||||
|
chunked_doc = chunker.chunk(processed_doc.content)
|
||||||
|
else:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for chunk in chunked_doc:
|
||||||
|
processed_docs.append({
|
||||||
|
"chunk": chunk,
|
||||||
|
"extra_metadata": {"file": filename}
|
||||||
|
})
|
||||||
|
|
||||||
|
return processed_docs
|
||||||
|
|
||||||
|
|
||||||
def convert_chunks_to_document(chunks: list[dict] | list[list[Chunk]]) -> list[Document]:
|
def ingest_documents(
|
||||||
|
chunked_docs: list[dict[str, Chunk | dict[str, Any]]],
|
||||||
|
es_index: str,
|
||||||
|
es_request_timeout: int,
|
||||||
|
es_max_retries: int,
|
||||||
|
es_retry_on_timeout: bool,
|
||||||
|
delete_es_index: bool,
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Convert the chunked content into a list of Document objects.
|
Ingest processed documents into an Elasticsearch index.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
chunks: A list of dictionaries containing 'content' and 'title' keys.
|
chunked_docs (list[dict[str, Any]]): List of dicts with "chunk" and "metadata" keys
|
||||||
|
es_index (str): Name of the Elasticsearch index to ingest into
|
||||||
|
es_request_timeout (int): Timeout for Elasticsearch requests in seconds
|
||||||
|
es_max_retries (int): Maximum number of retries for Elasticsearch requests
|
||||||
|
es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts
|
||||||
|
delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of Document objects created from the chunked content.
|
List of dicts with Elasticsearch response for each chunk
|
||||||
"""
|
"""
|
||||||
documents = []
|
logger.info(
|
||||||
|
f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
|
||||||
|
)
|
||||||
|
es = Elasticsearch(
|
||||||
|
hosts=settings.elasticsearch_local_url,
|
||||||
|
request_timeout=es_request_timeout,
|
||||||
|
max_retries=es_max_retries,
|
||||||
|
retry_on_timeout=es_retry_on_timeout,
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(chunks[0], dict):
|
if delete_es_index and es.indices.exists(index=es_index):
|
||||||
for chunk in chunks:
|
logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
|
||||||
content = chunk["content"]
|
es.indices.delete(index=es_index)
|
||||||
title = chunk["title"]
|
|
||||||
documents.append(Document(id=str(uuid.uuid4()),
|
|
||||||
page_content=content,
|
|
||||||
metadata={"source": title}))
|
|
||||||
|
|
||||||
else:
|
handshake = ElasticHandshakeWithMetadata(
|
||||||
for chunk_list in chunks:
|
client=es,
|
||||||
for chunk in chunk_list:
|
index_name=es_index,
|
||||||
content = chunk.text
|
embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
|
||||||
title = chunk.context.get("source", "unknown")
|
)
|
||||||
documents.append(Document(id=str(uuid.uuid4()),
|
|
||||||
page_content=content,
|
|
||||||
metadata={"source": title}))
|
|
||||||
|
|
||||||
return documents
|
logger.info(
|
||||||
|
f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
|
||||||
|
)
|
||||||
|
elasticsearch_chunks = handshake.write(chunked_docs)
|
||||||
|
|
||||||
|
return elasticsearch_chunks
|
||||||
|
|
||||||
|
|
||||||
|
def export_documents(elasticsearch_chunks: list[dict[str, Any]], output_path: str) -> None:
    """
    Export the indexed chunks to a single JSON file.

    The input list is not modified: each action is shallow-copied before its
    embedding is converted to a plain list for JSON serialization.

    Args:
        elasticsearch_chunks (list[dict[str, Any]]): Bulk actions, each carrying a "_source" dict
        output_path (str): Path of the JSON file, relative to the project root

    Returns:
        None
    """
    # NOTE(review): src/config.py in this change set exposes `project_root`,
    # while this module was written against `proj_root`; accept either name.
    project_root = getattr(settings, "project_root", None)
    if project_root is None:
        project_root = settings.proj_root

    target_path = project_root / output_path
    # Make sure the destination folder exists before opening the file.
    target_path.parent.mkdir(parents=True, exist_ok=True)

    serializable_chunks = []
    for action in elasticsearch_chunks:
        source = dict(action["_source"])
        embedding = source.get("embedding")
        # Array-like embeddings (e.g. NumPy) are not JSON serializable; plain
        # lists (already converted) are left untouched.
        if hasattr(embedding, "tolist"):
            source["embedding"] = embedding.tolist()
        serializable_chunks.append({**action, "_source": source})

    with target_path.open("w", encoding="utf-8") as f:
        json.dump(serializable_chunks, f, ensure_ascii=False, indent=4)

    logger.info(f"Exported processed documents to {target_path}")
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,125 @@
|
||||||
|
import requests
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from chonkie.embeddings import BaseEmbeddings
|
||||||
|
|
||||||
|
from src.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaEmbeddings(BaseEmbeddings):
    """Chonkie embeddings adapter for a local Ollama embedding model.

    Requests are sent to ``<base_url>/api/embed`` through ``requests`` and the
    returned vectors are converted to ``float32`` NumPy arrays.
    """

    def __init__(
        self,
        model: str,
        base_url: str = settings.ollama_local_url,
        timeout: float = 60.0,
        truncate: bool = True,
        keep_alive: str = "5m",
    ) -> None:
        """Store the connection settings; no request is made here.

        Args:
            model: Model name sent as "model" in every request payload.
            base_url: Root URL of the Ollama server; a trailing "/" is stripped.
            timeout: Timeout passed to ``requests.post`` for each call.
            truncate: Value sent as "truncate" in the request payload.
            keep_alive: Value sent as "keep_alive" in the request payload.
        """
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.truncate = truncate
        self.keep_alive = keep_alive
        # Filled in lazily from the first embedding response.
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Return the embedding size, probing the model once if it is unknown."""
        if self._dimension is None:
            # Lazy-load the dimension from a real embedding response.
            self._dimension = int(self.embed(" ").shape[0])
        return self._dimension

    def embed(self, text: str) -> np.ndarray:
        """Embed a single text and return its vector.

        Raises:
            RuntimeError: If the request fails or the response has no vectors.
        """
        embeddings = self._embed_api(text)
        if not embeddings:
            raise RuntimeError("Ollama returned no embeddings for the given text.")

        vector = np.asarray(embeddings[0], dtype=np.float32)

        if self._dimension is None:
            self._dimension = int(vector.shape[0])

        return vector

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several texts in one request, returning one vector per text.

        Raises:
            RuntimeError: If the request fails or the number of returned
                vectors differs from the number of inputs.
        """
        if not texts:
            return []

        embeddings = self._embed_api(texts)
        # Callers pair vectors with inputs by position, so a count mismatch
        # must not pass silently.
        if len(embeddings) != len(texts):
            raise RuntimeError(
                f"Ollama returned {len(embeddings)} embeddings for {len(texts)} inputs."
            )

        vectors = [np.asarray(vector, dtype=np.float32) for vector in embeddings]

        if self._dimension is None:
            self._dimension = int(vectors[0].shape[0])

        return vectors

    def count_tokens(self, text: str) -> int:
        """Return the "prompt_eval_count" reported by Ollama for ``text``.

        Note that this issues a full embedding request.

        Raises:
            RuntimeError: If the request fails.
            KeyError: If the response has no "prompt_eval_count" field.
        """
        payload = self._build_payload(text)
        response = self._post_embed(payload)
        return int(response["prompt_eval_count"])

    def count_tokens_batch(self, texts: list[str]) -> list[int]:
        """Return the token count of each text (one request per text)."""
        # Ollama returns a single prompt_eval_count for the whole request,
        # not one count per input item, so we compute them individually.
        return [self.count_tokens(text) for text in texts]

    def get_tokenizer(self) -> Callable[[str], int]:
        """Return a callable that maps a text to its token count."""
        # Chonkie mainly needs something usable for token counting.
        return self.count_tokens

    @classmethod
    def is_available(cls) -> bool:
        """Return True if the configured Ollama server answers ``/api/tags``."""
        # Strip the trailing slash like __init__ does, to avoid "//api/tags".
        base_url = settings.ollama_local_url.rstrip("/")
        try:
            response = requests.get(
                f"{base_url}/api/tags",
                timeout=5.0,
            )
            response.raise_for_status()
            return True
        except requests.RequestException:
            return False

    def __repr__(self) -> str:
        return (
            f"OllamaEmbeddings("
            f"model={self.model!r}, "
            f"base_url={self.base_url!r}, "
            f"dimension={self._dimension!r}"
            f")"
        )

    def _build_payload(self, text_or_texts: str | list[str]) -> dict[str, Any]:
        """Build the JSON body for an ``/api/embed`` request."""
        return {
            "model": self.model,
            "input": text_or_texts,
            "truncate": self.truncate,
            "keep_alive": self.keep_alive,
        }

    def _post_embed(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST ``payload`` to ``/api/embed`` and return the decoded JSON object.

        Raises:
            RuntimeError: If the HTTP call fails, the body is not valid JSON,
                or the response does not contain an "embeddings" field.
        """
        try:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
        # ValueError covers a non-JSON body on requests versions whose
        # JSONDecodeError is not a RequestException subclass.
        except (requests.RequestException, ValueError) as exc:
            raise RuntimeError(
                f"Failed to call Ollama embeddings endpoint at "
                f"{self.base_url}/api/embed"
            ) from exc

        if not isinstance(data, dict) or "embeddings" not in data:
            # A non-object body has no keys to report; show its type instead.
            keys = list(data.keys()) if isinstance(data, dict) else type(data).__name__
            raise RuntimeError(
                "Ollama response did not include 'embeddings'. "
                f"Response keys: {keys}"
            )

        return data

    def _embed_api(self, text_or_texts: str | list[str]) -> list[list[float]]:
        """Request embeddings for one text or a list of texts and return the raw vectors."""
        payload = self._build_payload(text_or_texts)
        data = self._post_embed(payload)
        return data["embeddings"]
|
||||||
144
src/config.py
144
src/config.py
|
|
@ -1,39 +1,29 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
from pydantic import Field
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from datetime import timedelta
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
raw_path_: str
|
data_path_: Optional[str] = None
|
||||||
data_path_: str
|
raw_path_: Optional[str] = None
|
||||||
processed_path_: str
|
processed_path_: Optional[str] = None
|
||||||
models_path_: str
|
models_path_: Optional[str] = None
|
||||||
external_path_: str
|
external_path_: Optional[str] = None
|
||||||
kubeconfig_path: str
|
interim_path_: Optional[str] = None
|
||||||
interim_path_: str
|
kubeconfig_path_: Optional[str] = None
|
||||||
database_url: str
|
postgres_url: str
|
||||||
openai_api_key: str
|
|
||||||
elasticsearch_index: str
|
|
||||||
elasticsearch_docs_index: str
|
|
||||||
elasticsearch_code_index: str
|
|
||||||
llm_base_url: str
|
|
||||||
ollama_url: str
|
|
||||||
ollama_local_url: str
|
|
||||||
langfuse_host: str
|
|
||||||
elasticsearch_url: str
|
elasticsearch_url: str
|
||||||
elasticsearch_local_url: str
|
elasticsearch_local_url: str
|
||||||
|
ollama_url: str
|
||||||
|
ollama_local_url: str
|
||||||
ollama_model_name: str
|
ollama_model_name: str
|
||||||
ollama_emb_model_name: str
|
ollama_emb_model_name: str
|
||||||
model_name: str
|
langfuse_host: str
|
||||||
hf_emb_model_name: str
|
|
||||||
langfuse_public_key: str
|
langfuse_public_key: str
|
||||||
langfuse_secret_key: str
|
langfuse_secret_key: str
|
||||||
hf_token: str
|
hf_token: str
|
||||||
|
hf_emb_model_name: str
|
||||||
|
|
||||||
model_config = SettingsConfigDict(
|
model_config = SettingsConfigDict(
|
||||||
env_file=".env",
|
env_file=".env",
|
||||||
|
|
@ -43,108 +33,40 @@ class Settings(BaseSettings):
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def data_path(self) -> Path:
|
def project_root(self) -> Path:
|
||||||
return Path(self.data_path_)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def models_path(self) -> Path:
|
|
||||||
return Path(self.models_path_)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def processed_path(self) -> Path:
|
|
||||||
return Path(self.processed_path_)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def raw_path(self) -> Path:
|
|
||||||
return Path(self.raw_path_)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def interim_path(self) -> Path:
|
|
||||||
return Path(self.interim_path_)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def external_path(self) -> Path:
|
|
||||||
return Path(self.external_path_)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def proj_root(self) -> Path:
|
|
||||||
return Path(__file__).resolve().parents[1]
|
return Path(__file__).resolve().parents[1]
|
||||||
|
|
||||||
@property
|
def _resolve_path(self, path: Optional[str]) -> Optional[Path]:
|
||||||
def database_url(self) -> str:
|
if path is None:
|
||||||
return self.database_url
|
return None
|
||||||
|
return self.project_root / path
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def openai_api_key(self) -> str:
|
def data_path(self) -> Optional[Path]:
|
||||||
return self.openai_api_key
|
return self._resolve_path(self.data_path_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def elasticsearch_index(self) -> str:
|
def raw_path(self) -> Optional[Path]:
|
||||||
return self.elasticsearch_index
|
return self._resolve_path(self.raw_path_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def elasticsearch_docs_index(self) -> str:
|
def processed_path(self) -> Optional[Path]:
|
||||||
return self.elasticsearch_docs_index
|
return self._resolve_path(self.processed_path_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def elasticsearch_code_index(self) -> str:
|
def models_path(self) -> Optional[Path]:
|
||||||
return self.elasticsearch_code_index
|
return self._resolve_path(self.models_path_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def llm_base_url(self) -> str:
|
def external_path(self) -> Optional[Path]:
|
||||||
return self.llm_base_url
|
return self._resolve_path(self.external_path_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ollama_url(self) -> str:
|
def interim_path(self) -> Optional[Path]:
|
||||||
return self.ollama_url
|
return self._resolve_path(self.interim_path_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ollama_local_url(self) -> str:
|
def kubeconfig_path(self) -> Optional[Path]:
|
||||||
return self.ollama_local_url
|
return self._resolve_path(self.kubeconfig_path_)
|
||||||
|
|
||||||
@property
|
|
||||||
def langfuse_host(self) -> str:
|
|
||||||
return self.langfuse_host
|
|
||||||
|
|
||||||
@property
|
|
||||||
def elasticsearch_url(self) -> str:
|
|
||||||
return self.elasticsearch_url
|
|
||||||
|
|
||||||
@property
|
|
||||||
def elasticsearch_local_url(self) -> str:
|
|
||||||
return self.elasticsearch_local_url
|
|
||||||
|
|
||||||
@property
|
|
||||||
def ollama_model_name(self) -> str:
|
|
||||||
return self.ollama_model_name
|
|
||||||
|
|
||||||
@property
|
|
||||||
def ollama_emb_model_name(self) -> str:
|
|
||||||
return self.ollama_emb_model_name
|
|
||||||
|
|
||||||
@property
|
|
||||||
def model_name(self) -> str:
|
|
||||||
return self.model_name
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hf_emb_model_name(self) -> str:
|
|
||||||
return self.hf_emb_model_name
|
|
||||||
|
|
||||||
@property
|
|
||||||
def langfuse_public_key(self) -> str:
|
|
||||||
return self.langfuse_public_key
|
|
||||||
|
|
||||||
@property
|
|
||||||
def langfuse_secret_key(self) -> str:
|
|
||||||
return self.langfuse_secret_key
|
|
||||||
|
|
||||||
@property
|
|
||||||
def hf_token(self) -> str:
|
|
||||||
return self.hf_token
|
|
||||||
|
|
||||||
@property
|
|
||||||
def kubeconfig_path(self) -> Path:
|
|
||||||
return Path(self.kubeconfig_path)
|
|
||||||
|
|
||||||
|
|
||||||
settings = Settings()
|
settings = Settings()
|
||||||
29
uv.lock
29
uv.lock
|
|
@ -250,7 +250,6 @@ name = "assistance-engine"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { virtual = "." }
|
source = { virtual = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "chonkie", extra = ["semantic"] },
|
|
||||||
{ name = "grpcio" },
|
{ name = "grpcio" },
|
||||||
{ name = "grpcio-reflection" },
|
{ name = "grpcio-reflection" },
|
||||||
{ name = "grpcio-tools" },
|
{ name = "grpcio-tools" },
|
||||||
|
|
@ -273,7 +272,9 @@ dependencies = [
|
||||||
dev = [
|
dev = [
|
||||||
{ name = "beir" },
|
{ name = "beir" },
|
||||||
{ name = "boto3" },
|
{ name = "boto3" },
|
||||||
|
{ name = "chonkie", extra = ["elastic", "semantic"] },
|
||||||
{ name = "evidently" },
|
{ name = "evidently" },
|
||||||
|
{ name = "flatbuffers" },
|
||||||
{ name = "jupyter" },
|
{ name = "jupyter" },
|
||||||
{ name = "langfuse" },
|
{ name = "langfuse" },
|
||||||
{ name = "litellm" },
|
{ name = "litellm" },
|
||||||
|
|
@ -288,7 +289,6 @@ dev = [
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
|
|
||||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
||||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||||
|
|
@ -311,7 +311,9 @@ requires-dist = [
|
||||||
dev = [
|
dev = [
|
||||||
{ name = "beir", specifier = ">=2.2.0" },
|
{ name = "beir", specifier = ">=2.2.0" },
|
||||||
{ name = "boto3", specifier = ">=1.42.58" },
|
{ name = "boto3", specifier = ">=1.42.58" },
|
||||||
|
{ name = "chonkie", extras = ["elastic", "semantic"], specifier = ">=1.6.0" },
|
||||||
{ name = "evidently", specifier = ">=0.7.20" },
|
{ name = "evidently", specifier = ">=0.7.20" },
|
||||||
|
{ name = "flatbuffers", specifier = ">=25.12.19" },
|
||||||
{ name = "jupyter", specifier = ">=1.1.1" },
|
{ name = "jupyter", specifier = ">=1.1.1" },
|
||||||
{ name = "langfuse", specifier = "<3" },
|
{ name = "langfuse", specifier = "<3" },
|
||||||
{ name = "litellm", specifier = ">=1.82.0" },
|
{ name = "litellm", specifier = ">=1.82.0" },
|
||||||
|
|
@ -595,7 +597,7 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "chonkie"
|
name = "chonkie"
|
||||||
version = "1.5.6"
|
version = "1.6.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "chonkie-core" },
|
{ name = "chonkie-core" },
|
||||||
|
|
@ -603,12 +605,15 @@ dependencies = [
|
||||||
{ name = "tenacity" },
|
{ name = "tenacity" },
|
||||||
{ name = "tqdm" },
|
{ name = "tqdm" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/e5/72/fdf8f89ff439f4ec357af0866c819512391936e4e61b6f15635a48434b8a/chonkie-1.6.0.tar.gz", hash = "sha256:14120d80610c1f549027fc7aa9a5ff604a729b545836f6cadd65d5ae83596279", size = 187056, upload-time = "2026-03-11T04:55:07.657Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
|
{ url = "https://files.pythonhosted.org/packages/ae/c2/7ea7d3409df220dd0e048b1113b44f47eccab9d517b00b037ab0e34c3c7a/chonkie-1.6.0-py3-none-any.whl", hash = "sha256:aa357e02f5cdacac6f8280c5e8651207c866b4137bcf20904db8670ee0808877", size = 232997, upload-time = "2026-03-11T04:55:05.252Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.optional-dependencies]
|
[package.optional-dependencies]
|
||||||
|
elastic = [
|
||||||
|
{ name = "elasticsearch" },
|
||||||
|
]
|
||||||
semantic = [
|
semantic = [
|
||||||
{ name = "model2vec" },
|
{ name = "model2vec" },
|
||||||
{ name = "tokenizers" },
|
{ name = "tokenizers" },
|
||||||
|
|
@ -1061,6 +1066,14 @@ wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "flatbuffers"
|
||||||
|
version = "25.12.19"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fqdn"
|
name = "fqdn"
|
||||||
version = "1.5.1"
|
version = "1.5.1"
|
||||||
|
|
@ -3112,14 +3125,14 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "opentelemetry-proto"
|
name = "opentelemetry-proto"
|
||||||
version = "1.39.1"
|
version = "1.40.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "protobuf" },
|
{ name = "protobuf" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" },
|
{ url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue