Merge branch 'mrh-online-dev' of github.com:BRUNIX-AI/assistance-engine into mrh-online-dev
This commit is contained in:
commit
ba4a1f1efc
|
|
@ -35,7 +35,7 @@ class BrunixEngine(brunix_pb2_grpc.AssistanceEngineServicer):
|
|||
index_name=os.getenv("ELASTICSEARCH_INDEX"),
|
||||
embedding=self.embeddings,
|
||||
query_field="text",
|
||||
vector_query_field="vector",
|
||||
vector_query_field="embedding",
|
||||
)
|
||||
self.graph = build_graph(
|
||||
llm=self.llm,
|
||||
|
|
|
|||
45
README.md
45
README.md
|
|
@ -45,13 +45,15 @@ graph TD
|
|||
|
||||
├── README.md # System documentation & Dev guide
|
||||
├── changelog # Version tracking and release history
|
||||
├── pyproject.toml # Python project configuration
|
||||
├── pyproject.toml
|
||||
├── ingestion/ # Data ingested in Elasticsearch
|
||||
├── docs/
|
||||
| ├── AVAP Language: ... # AVAP DSL Documentation
|
||||
| | └── AVAP.md
|
||||
│ ├── AVAP Language: ... # AVAP DSL Documentation
|
||||
│ │ └── AVAP.md
|
||||
│ ├── developer.avapfr... # Documents on developer web page
|
||||
| └── LRM/ # AVAP LRM documentation
|
||||
| └── avap.md
|
||||
│ ├── LRM/ # AVAP LRM documentation
|
||||
│ │ └── avap.md
|
||||
│ └── samples/ # AVAP code samples
|
||||
├── Docker/
|
||||
│ ├── protos/
|
||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||
|
|
@ -64,30 +66,16 @@ graph TD
|
|||
│ ├── Dockerfile # Container definition for the Engine
|
||||
│ ├── docker-compose.yaml # Local orchestration for dev environment
|
||||
│ ├── requirements.txt # Python dependencies for Docker
|
||||
│ ├── protos/
|
||||
│ │ └── brunix.proto # Protocol Buffers: The source of truth for the API
|
||||
│ └── src/
|
||||
│ ├── graph.py # Workflow graph orchestration
|
||||
│ ├── prompts.py # Centralized prompt definitions
|
||||
│ ├── server.py # gRPC Server & RAG Orchestration
|
||||
│ ├── state.py # Shared state management
|
||||
│ └── utils/ # Utility modules
|
||||
├── ingestion/
|
||||
│ └── docs/ # AVAP documentation chunks
|
||||
├── kubernetes/
|
||||
│ └── kubeconfig.yaml # Kubernetes cluster configuration
|
||||
│ └── .dockerignore # Docker ignore files
|
||||
├── scripts/
|
||||
│ └── pipelines/
|
||||
| ├── samples_generator/ # AVAP Sample generator
|
||||
| | └─ generate_mbap.py
|
||||
│ └── flows/ # Data processing flows
|
||||
| └─ elasticsearch_ingestion.py
|
||||
│ ├── flows/ # Processing pipelines
|
||||
│ └── tasks/ # Modules used by the flows
|
||||
└── src/
|
||||
├── __init__.py
|
||||
├── config.py # Environment variables configuration file
|
||||
└── utils/
|
||||
├── emb_factory.py # Embedding model factory
|
||||
├── llm_factory.py # LLM model factory
|
||||
└── __init__.py
|
||||
└── llm_factory.py # LLM model factory
|
||||
```
|
||||
|
||||
---
|
||||
|
|
@ -146,6 +134,7 @@ The engine utilizes Langfuse for end-to-end tracing and performance monitoring.
|
|||
Create a `.env` file in the project root with the following variables:
|
||||
|
||||
```env
|
||||
PYTHONPATH=${PYTHONPATH}:/home/...
|
||||
ELASTICSEARCH_URL=http://host.docker.internal:9200
|
||||
ELASTICSEARCH_LOCAL_URL=http://localhost:9200
|
||||
ELASTICSEARCH_INDEX=avap-docs-test
|
||||
|
|
@ -157,11 +146,13 @@ OLLAMA_URL=http://host.docker.internal:11434
|
|||
OLLAMA_LOCAL_URL=http://localhost:11434
|
||||
OLLAMA_MODEL_NAME=qwen2.5:1.5b
|
||||
OLLAMA_EMB_MODEL_NAME=qwen3-0.6B-emb:latest
|
||||
HF_TOKEN=hf_...
|
||||
HF_EMB_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
|
||||
```
|
||||
|
||||
| Variable | Required | Description | Example |
|
||||
|---|---|---|---|
|
||||
| `PYTHONPATH` | No | Path that aims to the root of the project | `${PYTHONPATH}:/home/...` |
|
||||
| `ELASTICSEARCH_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in Docker | `http://host.docker.internal:9200` |
|
||||
| `ELASTICSEARCH_LOCAL_URL` | Yes | Elasticsearch endpoint used for vector/context retrieval in local | `http://localhost:9200` |
|
||||
| `ELASTICSEARCH_INDEX` | Yes | Elasticsearch index name used by the engine | `avap-docs-test` |
|
||||
|
|
@ -183,13 +174,13 @@ Open a terminal and establish the connection to the Devaron Cluster:
|
|||
|
||||
```bash
|
||||
# 1. AI Model Tunnel (Ollama)
|
||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/ollama-light-service 11434:11434 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
|
||||
# 2. Knowledge Base Tunnel (Elasticsearch)
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-vector-db 9200:9200 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
|
||||
# 3. Observability DB Tunnel (PostgreSQL)
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/ivar.yaml &
|
||||
kubectl port-forward --address 0.0.0.0 svc/brunix-postgres 5432:5432 -n brunix --kubeconfig ./kubernetes/kubeconfig.yaml &
|
||||
```
|
||||
|
||||
### 5. Launch the Engine
|
||||
|
|
|
|||
|
|
@ -4,24 +4,23 @@ All notable changes to the **Brunix Assistance Engine** will be documented in th
|
|||
|
||||
---
|
||||
|
||||
## [1.5.0] - 2026-03-11
|
||||
## [1.5.0] - 2026-03-12
|
||||
|
||||
### Added
|
||||
- IMPLEMENTED:
|
||||
- `scripts/pipelines/flows/translate_mbpp.py`: pipeline to generate synthethic dataset from mbpp dataset.
|
||||
- `scripts/input/prompts.py`: module containing prompts for pipelines.
|
||||
- `scripts/tasks/prompts.py`: module containing prompts for pipelines.
|
||||
- `scripts/tasks/chunk.py`: module containing functions related to chunk management.
|
||||
- `synthethic_datasets`: folder containing generated synthethic datasets.
|
||||
- `src/config.py`: environment variables configuration file.
|
||||
|
||||
### Changed
|
||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs` documents instead of pre chunked files.
|
||||
- REFACTORED: `scripts/pipelines/flows/elasticsearch_ingestion.py` now uses `docs/LRM` or `docs/samples` documents instead of pre chunked files.
|
||||
- RENAMED `docs/AVAP Language: Core Commands & Functional Specification` to `docs/avap_language_github_docs`.
|
||||
- REMOVED: `Makefile` file.
|
||||
- REMOVED: `scripts/start-tunnels.sh` script.
|
||||
- REMOVED `ingestion` folder.
|
||||
- DEPENDENCIES: `requirements.txt` updated with new libraries required by the new modules.
|
||||
- MOVED `scripts/generate_mbpp_avap.py` into `scripts/flows/generate_mbpp_avap.py`
|
||||
- MOVED `scripts/generate_mbap.py` into `scripts/flows/generate_mbap.py`.
|
||||
|
||||
|
||||
## [1.4.0] - 2026-03-10
|
||||
|
|
|
|||
2305
docs/AVAP_dev.md
2305
docs/AVAP_dev.md
File diff suppressed because it is too large
Load Diff
159
docs/LRM/avap.md
159
docs/LRM/avap.md
|
|
@ -115,7 +115,41 @@ AVAP utiliza una gramática estructural mixta. Combina la fluidez de las palabra
|
|||
La estructura `if()` evalúa una expresión lógica o de comparación. Todo bloque condicional requiere un cierre explícito utilizando el comando `end()`.
|
||||
|
||||
El comando `if()` soporta dos modos de invocación:
|
||||
* **Modo 1 (comparación estructurada):** `if(variable, valor, comparador)` — evalúa la comparación entre variable y valor usando el operador indicado como string (ej. `"=="`, `">"`, `"!="`). Los dos primeros argumentos deben ser identificadores simples o literales, nunca expresiones de acceso como `dict['clave']`. Si se necesita comparar un valor extraído de una estructura, debe asignarse primero a una variable.* **Modo 2 (expresión libre):** `if(None, None, "expresion_compleja")` — evalúa directamente una expresión booleana compleja proporcionada como string.
|
||||
* **Modo 1 (comparación estructurada):** `if(variable, valor, comparador)` — evalúa la comparación entre variable y valor usando el operador indicado como string (ej. `"=="`, `">"`, `"!="`). Los dos primeros argumentos deben ser identificadores simples o literales, nunca expresiones de acceso como `dict['clave']`. Si se necesita comparar un valor extraído de una estructura, debe asignarse primero a una variable.* **Modo 2 (expresión libre):** `if(None, None, expresion_compleja)` — evalúa directamente una expresión booleana compleja proporcionada como string encapsulado entre `.
|
||||
|
||||
## SECCIÓN III: Lógica de Control y Estructuras de Decisión
|
||||
|
||||
AVAP utiliza una gramática estructural mixta. Combina la fluidez de las palabras clave para abrir bloques funcionales con la seguridad matemática de cierres estrictos.
|
||||
|
||||
### 3.1 El Bloque Condicional (if() / else() / end())
|
||||
El comando `if()` gestiona la lógica condicional mediante dos modos de invocación estrictamente diferenciados. Es imperativo respetar los delimitadores y la posición de los argumentos.
|
||||
|
||||
#### Modo 1: Comparación Estructurada (Atómica)
|
||||
Se utiliza para comparaciones directas entre dos valores simples.
|
||||
* **Sintaxis:** `if(átomo_1, átomo_2, "operador")`
|
||||
* **Argumentos 1 y 2:** Deben ser identificadores simples (variables) o literales (strings/números). **No se permite el uso de `None` en este modo.**
|
||||
* **Argumento 3:** El operador de comparación debe ir obligatoriamente entre **comillas dobles** (`"=="`, `"!="`, `">"`, `"<"`, `">="`, `"<="`).
|
||||
* **Restricción:** No se permiten expresiones de acceso (ej. `data.user` o `list[0]`). Estos valores deben asignarse previamente a una variable.
|
||||
* **Ejemplo correcto:** `if(reintentos, 5, "<")`
|
||||
|
||||
#### Modo 2: Expresión Libre (Evaluación Compleja)
|
||||
Se utiliza para evaluar expresiones lógicas que no encajan en la estructura atómica.
|
||||
* **Sintaxis:** `if(None, None, `expresión_compleja`)`
|
||||
* **Argumentos 1 y 2:** Deben ser literalmente la palabra `None` (sin comillas).
|
||||
* **Argumento 3:** La expresión completa **debe** estar encapsulada entre **acentos graves (backticks)**. Esto permite incluir lógica interna, operadores `and/or` y accesos a estructuras de datos.
|
||||
* **Ejemplo correcto:** `if(None, None, `user.id > 10 and email.contains("@")`)`
|
||||
|
||||
---
|
||||
|
||||
### Tabla de Validación para el Modelo
|
||||
|
||||
| Entrada | Estado | Razón |
|
||||
| :--- | :--- | :--- |
|
||||
| `if(count, 10, "==")` | ✅ VÁLIDO | Modo 1: Átomos válidos y operador entre comillas. |
|
||||
| `if(None, None, `val > 0`)` | ✅ VÁLIDO | Modo 2: Uso correcto de `None` y backticks. |
|
||||
| `if(username, None, "==")` | ❌ ERROR | El Modo 1 prohíbe el uso de `None`. Debe usarse el Modo 2. |
|
||||
| `if(None, None, "val > 0")` | ❌ ERROR | El Modo 2 requiere backticks (`` ` ``), no comillas. |
|
||||
| `if(user.id, 10, "==")` | ❌ ERROR | El Modo 1 no permite expresiones de acceso (`.`). |
|
||||
|
||||
### 3.2 Iteraciones Estrictas y Deterministas (startLoop / endLoop)
|
||||
Para garantizar el determinismo y evitar el colapso de memoria:
|
||||
|
|
@ -137,15 +171,17 @@ Diseñada para proteger la estabilidad del servidor ante fallos de I/O.
|
|||
[ "else()" <EOL> <block> ]
|
||||
"end()" <EOL>
|
||||
|
||||
/* if() soporta dos modos:
|
||||
Modo 1 — comparación estructurada: los dos primeros argumentos deben ser
|
||||
identificadores simples o literales, nunca expresiones de acceso.
|
||||
Si se necesita comparar un valor extraído de una estructura (ej. dict['clave']),
|
||||
debe asignarse previamente a una variable.
|
||||
Modo 2 — expresión libre: None, None, expresión compleja como string */
|
||||
<if_condition> ::= <if_atom> "," <if_atom> "," <stringliteral>
|
||||
| "None" "," "None" "," <stringliteral>
|
||||
<if_atom> ::= <identifier> | <literal>
|
||||
<if_condition> ::= <if_structured> | <if_free_expression>
|
||||
|
||||
<if_structured> ::= "if" "(" <strict_atom> "," <strict_atom> "," <backtick_string> ")"
|
||||
<if_free_expression> ::= "if" "(" "None" "," "None" "," <backtick_string> ")"
|
||||
|
||||
<strict_atom> ::= <identifier> | <non_null_literal>
|
||||
<backtick_string> ::= "`" <text_content> "`"
|
||||
|
||||
<identifier> ::= [a-zA-Z_][a-zA-Z0-9_]*
|
||||
<non_null_literal>::= <number> | <string_literal_double_quotes>
|
||||
/* Nota: <non_null_literal> NO incluye la palabra "None" */
|
||||
|
||||
<loop_stmt> ::= "startLoop(" <identifier> "," <expression> "," <expression> ")" <EOL>
|
||||
<block>
|
||||
|
|
@ -261,59 +297,116 @@ AVAP utiliza `avapConnector("TOKEN")` para la hidratación segura de credenciale
|
|||
|
||||
---
|
||||
|
||||
## SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos
|
||||
# SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos
|
||||
|
||||
AVAP incluye un set de comandos integrados de alto nivel para manipular tipos complejos (JSON y Listas), tiempos, textos y generar hashes.
|
||||
|
||||
### 6.1 Manipulación Nativa de Listas y Objetos JSON
|
||||
Para extraer y mutar estructuras complejas, AVAP provee comandos nativos específicos:
|
||||
* **`variableToList(elemento, destino)`**: Fuerza a que una variable escalar se convierta en una estructura iterable de lista.
|
||||
* **`itemFromList(lista_origen, indice, destino)`**: Extrae de forma segura el elemento contenido en la posición `indice` de una lista.
|
||||
* **`variableFromJSON(json_origen, clave, destino)`**: Parsea un objeto JSON en memoria y extrae el valor correspondiente a la `clave`.
|
||||
* **`AddVariableToJSON(clave, valor, json_destino)`**: Inyecta dinámicamente una nueva propiedad dentro de un objeto JSON existente.
|
||||
---
|
||||
|
||||
### 6.2 Criptografía y Expresiones Regulares
|
||||
* **`encodeSHA256` y `encodeMD5(origen, destino)`**: Funciones criptográficas que encriptan de forma irreversible un texto. Vitales para el almacenamiento seguro de contraseñas.
|
||||
* **`getRegex(origen, patron, destino)`**: Aplica una Expresión Regular (`patron`) sobre la variable de origen, extrayendo las coincidencias exactas.
|
||||
## 6.1 Manipulación Nativa de Listas y Objetos JSON
|
||||
|
||||
### 6.3 Transformación de Tiempo y Cadenas
|
||||
* **Fechas:** `getTimeStamp` (convierte un string a Epoch), `getDateTime` (Epoch a string legible), y `stampToDatetime` (Epoch a objeto datetime estructurado). Soportan formatos de calendario y cálculos con TimeDeltas.
|
||||
* **Cadenas:** `replace` (saneamiento y sustitución de texto) y `randomString` (generación determinista de claves/tokens aleatorios).
|
||||
Para extraer y mutar estructuras complejas, AVAP provee comandos nativos específicos. En AVAP, las listas **no se instancian con literales de array**, sino que se construyen y recorren a través de un conjunto cerrado de comandos especializados:
|
||||
|
||||
### Especificación BNF (Sección VI)
|
||||
* **`variableToList(elemento, destino)`**: Fuerza a que una variable escalar se convierta en una estructura iterable de lista de un único elemento. Es el punto de entrada canónico para construir una lista desde cero a partir de un valor existente.
|
||||
|
||||
* **`itemFromList(lista_origen, indice, destino)`**: Extrae de forma segura el elemento contenido en la posición `indice` (base 0) de una lista. Equivale a un acceso por índice controlado.
|
||||
|
||||
* **`getListLen(lista, destino)`**: Calcula el número total de elementos contenidos en `lista` y almacena el resultado entero en `destino`. Imprescindible para construir bucles de recorrido seguro y para validar listas antes de acceder a sus índices. Se recomienda llamar siempre a `getListLen` antes de `itemFromList` para evitar accesos fuera de rango.
|
||||
|
||||
* **`variableFromJSON(json_origen, clave, destino)`**: Parsea un objeto JSON en memoria y extrae el valor correspondiente a la `clave`, almacenándolo en `destino`. El acceso es directo por nombre de propiedad.
|
||||
|
||||
* **`AddVariableToJSON(clave, valor, json_destino)`**: Inyecta dinámicamente una nueva propiedad dentro de un objeto JSON existente. Si la clave ya existe, su valor es sobreescrito.
|
||||
|
||||
**Patrón de recorrido típico en AVAP:**
|
||||
|
||||
```avap
|
||||
// 1. Obtener longitud de la lista
|
||||
getListLen(myList, len)
|
||||
|
||||
// 2. Iterar con índice controlado
|
||||
i = 0
|
||||
while (i < len) {
|
||||
itemFromList(myList, i, currentItem)
|
||||
// ... procesar currentItem ...
|
||||
i = i + 1
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6.2 Criptografía y Expresiones Regulares
|
||||
|
||||
* **`encodeSHA256(origen, destino)`** y **`encodeMD5(origen, destino)`**: Funciones criptográficas que encriptan de forma irreversible un texto. Vitales para el almacenamiento seguro de contraseñas y la verificación de integridad de datos. SHA-256 produce un digest de 64 caracteres hexadecimales y ofrece mayor resistencia criptográfica que MD5 (32 caracteres); se recomienda SHA-256 para nuevos desarrollos.
|
||||
|
||||
* **`getRegex(origen, patron, destino)`**: Aplica una Expresión Regular (`patron`) sobre la variable de origen, extrayendo la primera coincidencia exacta encontrada. El patrón sigue la sintaxis estándar compatible con Python `re`.
|
||||
|
||||
---
|
||||
|
||||
## 6.3 Transformación de Tiempo y Cadenas
|
||||
|
||||
### Fechas y Timestamps
|
||||
|
||||
AVAP provee tres comandos complementarios para cubrir todas las conversiones posibles entre representaciones de tiempo. Los tres soportan formatos de calendario en notación `strftime` de Python y cálculos con `TimeDelta` expresados en segundos (positivo para sumar, negativo para restar):
|
||||
|
||||
| Comando | Entrada | Salida |
|
||||
|---|---|---|
|
||||
| `getTimeStamp(fecha_string, formato, timedelta, destino)` | String de fecha | Epoch (entero) |
|
||||
| `stampToDatetime(epoch, formato, timedelta, destino)` | Epoch (entero) | String de fecha |
|
||||
| `getDateTime(formato, timedelta, zona_horaria, destino)` | — (ahora mismo) | String de fecha |
|
||||
|
||||
* **`getTimeStamp(fecha_string, formato, timedelta, destino)`**: Convierte un string de fecha legible a su valor Epoch (entero Unix). Útil para almacenar fechas y realizar cálculos aritméticos sobre ellas.
|
||||
|
||||
* **`stampToDatetime(epoch, formato, timedelta, destino)`**: Convierte un valor Epoch a un string de fecha con el formato especificado. Útil para presentar timestamps almacenados de forma legible.
|
||||
|
||||
* **`getDateTime(formato, timedelta, zona_horaria, destino)`**: Captura la fecha y hora actuales del sistema, aplica el ajuste `timedelta` y las convierte a la `zona_horaria` indicada antes de almacenar el resultado. Acepta cualquier zona horaria reconocida por la librería `pytz` de Python.
|
||||
|
||||
### Cadenas de Texto
|
||||
|
||||
* **`randomString(patron, longitud, destino)`**: Genera una cadena aleatoria de `longitud` caracteres cuyos símbolos están restringidos al conjunto definido por `patron` (expresión regular de caracteres). Útil para generar tokens de sesión, contraseñas temporales o identificadores únicos.
|
||||
|
||||
* **`replace(origen, patron_busqueda, reemplazo, destino)`**: Localiza todas las ocurrencias de `patron_busqueda` dentro de `origen` y las sustituye por `reemplazo`, almacenando el resultado en `destino`. Facilita el saneamiento y normalización de datos de entrada antes de su procesamiento o almacenamiento.
|
||||
|
||||
---
|
||||
|
||||
## BNF — Gramática Formal de los Comandos de Utilidad
|
||||
|
||||
```bnf
|
||||
/* [CORRECCIÓN] Todas las subreglas de <util_command> están ahora completamente expandidas. */
|
||||
<util_command> ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd> | <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>
|
||||
<util_command> ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd>
|
||||
| <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>
|
||||
|
||||
/* Manipulación de listas y JSON */
|
||||
<json_list_cmd> ::= "variableToList(" <expression> "," <identifier> ")"
|
||||
| "itemFromList(" <identifier> "," <expression> "," <identifier> ")"
|
||||
| "getListLen(" <identifier> "," <identifier> ")"
|
||||
| "variableFromJSON(" <identifier> "," <expression> "," <identifier> ")"
|
||||
| "AddVariableToJSON(" <expression> "," <expression> "," <identifier> ")"
|
||||
|
||||
/* Criptografía */
|
||||
<crypto_cmd> ::= "encodeSHA256(" <identifier_or_string> "," <identifier> ")"
|
||||
| "encodeMD5(" <identifier_or_string> "," <identifier> ")"
|
||||
<crypto_cmd> ::= "encodeSHA256(" <expression> "," <identifier> ")"
|
||||
| "encodeMD5(" <expression> "," <identifier> ")"
|
||||
|
||||
/* Expresiones regulares */
|
||||
<regex_cmd> ::= "getRegex(" <identifier> "," <stringliteral> "," <identifier> ")"
|
||||
<regex_cmd> ::= "getRegex(" <identifier> "," <expression> "," <identifier> ")"
|
||||
|
||||
/* Fecha/hora actual → string */
|
||||
<datetime_cmd> ::= "getDateTime(" <stringliteral> "," <expression> "," <stringliteral> "," <identifier> ")"
|
||||
/* Argumentos: formato_salida, epoch_origen, zona_horaria, destino */
|
||||
/* Argumentos: formato_salida, timedelta, zona_horaria, destino */
|
||||
|
||||
/* Conversiones epoch ↔ string */
|
||||
<stamp_cmd> ::= "stampToDatetime(" <expression> "," <stringliteral> "," <expression> "," <identifier> ")"
|
||||
/* Argumentos: epoch_origen, formato, timedelta, destino */
|
||||
| "getTimeStamp(" <stringliteral> "," <stringliteral> "," <expression> "," <identifier> ")"
|
||||
/* Argumentos: fecha_string, formato_entrada, timedelta, destino */
|
||||
|
||||
<string_cmd> ::= "randomString(" <expression> "," <identifier> ")"
|
||||
/* Argumentos: longitud, destino */
|
||||
/* Cadenas */
|
||||
<string_cmd> ::= "randomString(" <expression> "," <expression> "," <identifier> ")"
|
||||
/* Argumentos: patron, longitud, destino */
|
||||
|
||||
<replace_cmd> ::= "replace(" <identifier_or_string> "," <stringliteral> "," <stringliteral> "," <identifier> ")"
|
||||
<replace_cmd> ::= "replace(" <identifier> "," <stringliteral> "," <stringliteral> "," <identifier> ")"
|
||||
/* Argumentos: origen, patron_busqueda, reemplazo, destino */
|
||||
```
|
||||
|
||||
|
||||
---
|
||||
|
||||
## SECCIÓN VII: Arquitectura de Funciones y Ámbitos (Scopes)
|
||||
|
|
|
|||
3191
docs/avap.txt
3191
docs/avap.txt
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
|
@ -5,7 +5,6 @@ description = "Add your description here"
|
|||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"chonkie[semantic]>=1.5.6",
|
||||
"grpcio>=1.78.0",
|
||||
"grpcio-reflection>=1.78.0",
|
||||
"grpcio-tools>=1.78.0",
|
||||
|
|
@ -28,7 +27,9 @@ dependencies = [
|
|||
dev = [
|
||||
"beir>=2.2.0",
|
||||
"boto3>=1.42.58",
|
||||
"chonkie[elastic,semantic]>=1.6.0",
|
||||
"evidently>=0.7.20",
|
||||
"flatbuffers>=25.12.19",
|
||||
"jupyter>=1.1.1",
|
||||
"langfuse<3",
|
||||
"litellm>=1.82.0",
|
||||
|
|
|
|||
|
|
@ -1,30 +1,29 @@
|
|||
import re
|
||||
import hashlib
|
||||
from typing import Any
|
||||
from enum import Enum
|
||||
import typer
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_core.documents import Document
|
||||
from langchain_elasticsearch import ElasticsearchStore
|
||||
from langchain_community.embeddings import HuggingFaceEmbeddings
|
||||
from langchain_experimental.text_splitter import SemanticChunker
|
||||
from chonkie import SemanticChunker, MarkdownChef
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from src.utils.emb_factory import create_embedding_model
|
||||
from scripts.pipelines.tasks.chunk import scrape_avap_docs
|
||||
from scripts.pipelines.tasks.chunk import (
|
||||
read_files,
|
||||
get_chunk_docs,
|
||||
convert_chunks_to_document
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||
|
||||
class DistanceStrategy(str, Enum):
|
||||
euclidean = "EUCLIDEAN_DISTANCE"
|
||||
|
|
@ -33,55 +32,45 @@ class DistanceStrategy(str, Enum):
|
|||
jaccard = "JACCARD"
|
||||
cosine = "COSINE"
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
text = text.replace("\u00a0", " ")
|
||||
text = re.sub(r"\s+", " ", text).strip()
|
||||
return text
|
||||
|
||||
def build_documents_from_folder(
|
||||
folder_path: str,
|
||||
) -> list[Document]:
|
||||
|
||||
folder = Path(folder_path)
|
||||
|
||||
if not folder.exists() or not folder.is_dir():
|
||||
raise ValueError(f"Invalid folder path: {folder_path}")
|
||||
|
||||
all_documents: list[Document] = []
|
||||
|
||||
for file_path in folder.glob("*.txt"):
|
||||
doc_text = file_path.read_text(encoding="utf-8")
|
||||
|
||||
if not doc_text.strip():
|
||||
continue
|
||||
|
||||
metadata: dict[str, Any] = {
|
||||
"source": file_path.name,
|
||||
}
|
||||
|
||||
doc_text = clean_text(doc_text)
|
||||
document = Document(
|
||||
id=hashlib.md5(file_path.name.encode()).hexdigest(),
|
||||
page_content=doc_text,
|
||||
metadata={**metadata}
|
||||
)
|
||||
|
||||
all_documents.append(document)
|
||||
|
||||
return all_documents
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "ingestion/docs",
|
||||
docs_folder_path: str = "docs",
|
||||
es_index: str = "avap-docs-test-v2",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float = 0.5,
|
||||
chunk_similarity_window: int = 3,
|
||||
chunk_skip_window: int = 1,
|
||||
):
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
logger.info(f"Using docs folder path: {docs_folder_path}")
|
||||
documents = build_documents_from_folder(folder_path=docs_folder_path)
|
||||
logger.info(f"Reading files from folder: {docs_folder_path}/LRM and {docs_folder_path}/samples...")
|
||||
avap_code_docs = read_files(f"{docs_folder_path}/samples")
|
||||
avap_language_docs = read_files(f"{docs_folder_path}/LRM")
|
||||
|
||||
logger.info("Instantiating semantic chunker and chef...")
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(HF_EMB_MODEL_NAME)
|
||||
chef = MarkdownChef(tokenizer=custom_tokenizer)
|
||||
chunker = SemanticChunker(
|
||||
embedding_model=HF_EMB_MODEL_NAME,
|
||||
chunk_size=chunk_size,
|
||||
threshold=chunk_threshold,
|
||||
similarity_window=chunk_similarity_window,
|
||||
skip_window=chunk_skip_window
|
||||
)
|
||||
logger.info("Processing Markdown docs with chef...")
|
||||
doc = chef.process(f"{docs_folder_path}/LRM/avap.md")
|
||||
|
||||
logger.info("Chunking AVAP Language docs...")
|
||||
avap_language_docs_chunks = get_chunk_docs(avap_language_docs, chunker)
|
||||
|
||||
logger.info("Creating Langchain Document to index...")
|
||||
avap_language_langchain_docs = convert_chunks_to_document(avap_language_docs_chunks)
|
||||
avap_code_langchain_docs = convert_chunks_to_document(avap_code_docs)
|
||||
avap_documents = avap_language_langchain_docs + avap_code_langchain_docs
|
||||
|
||||
logger.info("Connecting to Elasticsearch...")
|
||||
try:
|
||||
|
|
@ -106,15 +95,19 @@ def elasticsearch_ingestion(
|
|||
logger.exception("Failed to instantiate embeddings model.")
|
||||
raise
|
||||
|
||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
||||
logger.info(f"Checking if index {es_index} exists and deleting if it does...")
|
||||
if es.indices.exists(index=es_index):
|
||||
es.indices.delete(index=es_index)
|
||||
|
||||
logger.info(f"Uploading documents to index {es_index}...")
|
||||
ElasticsearchStore.from_documents(
|
||||
documents,
|
||||
avap_documents,
|
||||
embeddings,
|
||||
client=es,
|
||||
index_name=ELASTICSEARCH_INDEX,
|
||||
index_name=es_index,
|
||||
distance_strategy=distance_strategy.value,
|
||||
)
|
||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
||||
logger.info(f"Finished uploading documents to index {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -0,0 +1,122 @@
|
|||
import typer
|
||||
import logging
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from chonkie import MarkdownChef, FileFetcher, ElasticHandshake
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from src.config import settings
|
||||
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||
from scripts.pipelines.tasks.chunk import merge_markdown_document
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
def get_processing_and_chunking_config(docs_extension: str, chunk_size: int,
|
||||
chunk_threshold: float | None,
|
||||
chunk_similarity_window: int| None,
|
||||
chunk_skip_window: int | None) -> tuple[str, dict, str, dict]:
|
||||
"""
|
||||
Check the file extension and return the appropriate processing and chunking strategies and their kwargs.
|
||||
|
||||
Args:
|
||||
docs_extension (str): The file extension of the documents to be ingested.
|
||||
chunk_size (int): The size of the chunks to be created.
|
||||
chunk_threshold (float, optional): The threshold for semantic chunking. Required if docs_extension is .md.
|
||||
chunk_similarity_window (int, optional): The similarity window for semantic chunking
|
||||
chunk_skip_window (int, optional): The skip window for semantic chunking.
|
||||
|
||||
Returns:
|
||||
tuple[str, dict, str, dict]: A tuple containing the processing strategy, its kwargs, the chunking strategy, and its kwargs.
|
||||
"""
|
||||
if docs_extension == ".md":
|
||||
process_type = "markdown"
|
||||
custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
|
||||
process_kwargs = {"tokenizer": custom_tokenizer}
|
||||
# process_type = "text"
|
||||
# process_kwargs = {}
|
||||
chunk_strat = "semantic"
|
||||
chunk_kwargs = {"embedding_model": settings.hf_emb_model_name, "threshold": chunk_threshold, "chunk_size": chunk_size,
|
||||
"similarity_window": chunk_similarity_window, "skip_window": chunk_skip_window}
|
||||
|
||||
elif docs_extension == ".avap":
|
||||
process_type = "text"
|
||||
process_kwargs = {}
|
||||
chunk_strat = "recursive" # Once we have the BNF and uploaded to tree-sitter, we can use code (?)
|
||||
chunk_kwargs = {"chunk_size": chunk_size}
|
||||
|
||||
return process_type, process_kwargs, chunk_strat, chunk_kwargs
|
||||
|
||||
|
||||
@app.command()
def elasticsearch_ingestion(
    docs_folder_path: str = "docs/LRM",
    docs_extension: str = ".md",
    es_index: str = "avap-docs-test-v3",
    es_request_timeout: int = 120,
    es_max_retries: int = 5,
    es_retry_on_timeout: bool = True,
    delete_es_index: bool = True,
    chunk_size: int = 2048,
    chunk_threshold: float | None = 0.5,
    chunk_similarity_window: int | None = 3,
    chunk_skip_window: int | None = 1
):
    """
    Run the full ingestion pipeline: fetch files, process them, chunk them,
    and write the chunks into an Elasticsearch index.

    Args:
        docs_folder_path: Folder (relative to the project root) to ingest from.
        docs_extension: Extension of the documents to ingest (".md" or ".avap").
        es_index: Target Elasticsearch index name.
        es_request_timeout: Per-request timeout for the Elasticsearch client (seconds).
        es_max_retries: Max retries for the Elasticsearch client.
        es_retry_on_timeout: Whether the client retries timed-out requests.
        delete_es_index: If True, drop the target index first (full re-ingest).
        chunk_size: Chunk size passed to the chunking strategy.
        chunk_threshold: Semantic-chunking threshold (markdown only).
        chunk_similarity_window: Semantic-chunking similarity window (markdown only).
        chunk_skip_window: Semantic-chunking skip window (markdown only).
    """
    logger.info(f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}...")
    es = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )

    if delete_es_index and es.indices.exists(index=es_index):
        logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
        es.indices.delete(index=es_index)

    logger.info("Starting Elasticsearch ingestion pipeline...")
    (process_type,
     process_kwargs,
     chunk_strat,
     chunk_kwargs) = get_processing_and_chunking_config(docs_extension, chunk_size, chunk_threshold, chunk_similarity_window, chunk_skip_window)

    # Reuse the tokenizer the config step already loaded (markdown path) instead
    # of fetching a second copy of the same HF tokenizer, as the previous
    # version did unconditionally at function entry.
    custom_tokenizer = process_kwargs.get("tokenizer")
    if custom_tokenizer is None:
        custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)

    logger.info(f"Fetching files from {docs_folder_path}...")
    fetcher = FileFetcher()
    docs = fetcher.fetch(dir=f"{settings.proj_root}/{docs_folder_path}")

    # NOTE(review): the steps below are hard-wired to the markdown pipeline
    # (MarkdownChef + merge_markdown_document) even when process_type/chunk_strat
    # say otherwise (e.g. for .avap) — confirm whether a dispatch on
    # process_type/chunk_strat is intended here.
    logger.info(f"Processing documents with process_type: {process_type}...")
    chef = MarkdownChef(tokenizer=custom_tokenizer)
    processed_docs = [chef.process(doc) for doc in docs]

    logger.info(f"Chunking documents with chunk_strat: {chunk_strat}...")
    fused_docs = [merge_markdown_document(processed_doc) for processed_doc in processed_docs]

    logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
    handshake = ElasticHandshake(
        client=es,
        index_name=es_index,
        embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name)
    )
    for fused_doc in fused_docs:
        handshake.write(fused_doc.chunks)

    logger.info(f"Finished ingesting in {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: configure plain console logging so pipeline progress
    # is visible when the CLI is run directly.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )
    try:
        # Dispatch to the Typer CLI (e.g. the elasticsearch-ingestion command).
        app()
    except Exception as exc:
        # Log the full traceback, then re-raise so the process exits non-zero.
        logger.exception(exc)
        raise
|
||||
|
|
@ -32,12 +32,7 @@
|
|||
"\n",
|
||||
"from src.utils.llm_factory import create_chat_model\n",
|
||||
"from src.utils.emb_factory import create_embedding_model\n",
|
||||
"from src.config import (\n",
|
||||
" ELASTICSEARCH_LOCAL_URL,\n",
|
||||
" ELASTICSEARCH_INDEX,\n",
|
||||
" OLLAMA_MODEL_NAME,\n",
|
||||
" OLLAMA_EMB_MODEL_NAME\n",
|
||||
")"
|
||||
"from src.config import settings"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -51,20 +46,20 @@
|
|||
"\n",
|
||||
"llm = create_chat_model(\n",
|
||||
" provider=\"ollama\",\n",
|
||||
" model=OLLAMA_MODEL_NAME,\n",
|
||||
" model=settings.ollama_model_name,\n",
|
||||
" temperature=0.5,\n",
|
||||
" validate_model_on_init=True,\n",
|
||||
")\n",
|
||||
"embeddings = create_embedding_model(\n",
|
||||
" provider=\"ollama\",\n",
|
||||
" model=OLLAMA_EMB_MODEL_NAME,\n",
|
||||
" model=settings.ollama_emb_model_name,\n",
|
||||
")\n",
|
||||
"vector_store = ElasticsearchStore(\n",
|
||||
" es_url=ELASTICSEARCH_LOCAL_URL,\n",
|
||||
" index_name=ELASTICSEARCH_INDEX,\n",
|
||||
" es_url=settings.elasticsearch_local_url,\n",
|
||||
" index_name=\"avap-docs-test-v3\",\n",
|
||||
" embedding=embeddings,\n",
|
||||
" query_field=\"text\",\n",
|
||||
" vector_query_field=\"vector\",\n",
|
||||
" vector_query_field=\"embedding\",\n",
|
||||
" # strategy=ElasticsearchStore.ApproxRetrievalStrategy(\n",
|
||||
" # hybrid=True,\n",
|
||||
" # rrf={\"rank_constant\": 60, \"window_size\": 100}\n",
|
||||
|
|
@ -464,44 +459,185 @@
|
|||
"text": [
|
||||
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
||||
"\n",
|
||||
"What types of includes does AVAP have?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "ResponseError",
|
||||
"evalue": "failed to parse JSON: unexpected end of JSON input (status code: -1)",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mResponseError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m a = \u001b[43mstream_graph_updates\u001b[49m\u001b[43m(\u001b[49m\u001b[43muser_input\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magentic_graph\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langfuse/decorators/langfuse_decorator.py:256\u001b[39m, in \u001b[36mLangfuseDecorator._sync_observe.<locals>.sync_wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 254\u001b[39m result = func(*args, **kwargs)\n\u001b[32m 255\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m256\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_handle_exception\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobservation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 257\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 258\u001b[39m result = \u001b[38;5;28mself\u001b[39m._finalize_call(\n\u001b[32m 259\u001b[39m observation, result, capture_output, transform_to_string\n\u001b[32m 260\u001b[39m )\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langfuse/decorators/langfuse_decorator.py:520\u001b[39m, in \u001b[36mLangfuseDecorator._handle_exception\u001b[39m\u001b[34m(self, observation, e)\u001b[39m\n\u001b[32m 516\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m observation:\n\u001b[32m 517\u001b[39m _observation_params_context.get()[observation.id].update(\n\u001b[32m 518\u001b[39m level=\u001b[33m\"\u001b[39m\u001b[33mERROR\u001b[39m\u001b[33m\"\u001b[39m, status_message=\u001b[38;5;28mstr\u001b[39m(e)\n\u001b[32m 519\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m520\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langfuse/decorators/langfuse_decorator.py:254\u001b[39m, in \u001b[36mLangfuseDecorator._sync_observe.<locals>.sync_wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 251\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 253\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m254\u001b[39m result = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 255\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 256\u001b[39m \u001b[38;5;28mself\u001b[39m._handle_exception(observation, e)\n",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 9\u001b[39m, in \u001b[36mstream_graph_updates\u001b[39m\u001b[34m(user_input, graph)\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;129m@observe\u001b[39m(name=\u001b[33m\"\u001b[39m\u001b[33mgraph_run\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mstream_graph_updates\u001b[39m(user_input: \u001b[38;5;28mstr\u001b[39m, graph: StateGraph):\n\u001b[32m 3\u001b[39m langfuse_context.update_current_trace(\n\u001b[32m 4\u001b[39m user_id=\u001b[33m\"\u001b[39m\u001b[33malberto\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 5\u001b[39m tags=[\u001b[33m\"\u001b[39m\u001b[33mavap\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mrag\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mlanggraph\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 6\u001b[39m metadata={\u001b[33m\"\u001b[39m\u001b[33mfeature\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33magentic-rag\u001b[39m\u001b[33m\"\u001b[39m},\n\u001b[32m 7\u001b[39m )\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mgraph\u001b[49m\u001b[43m.\u001b[49m\u001b[43mstream\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43muser_input\u001b[49m\u001b[43m}\u001b[49m\u001b[43m]\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mvalues\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m \u001b[49m\u001b[43mevent\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43m-\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mpretty_print\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m event[\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m][-\u001b[32m1\u001b[39m]\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/pregel/main.py:2646\u001b[39m, in \u001b[36mPregel.stream\u001b[39m\u001b[34m(self, input, config, context, stream_mode, print_mode, output_keys, interrupt_before, interrupt_after, durability, subgraphs, debug, **kwargs)\u001b[39m\n\u001b[32m 2644\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m task \u001b[38;5;129;01min\u001b[39;00m loop.match_cached_writes():\n\u001b[32m 2645\u001b[39m loop.output_writes(task.id, task.writes, cached=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m-> \u001b[39m\u001b[32m2646\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m_\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrunner\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtick\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2647\u001b[39m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43mt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mloop\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtasks\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwrites\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2648\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mstep_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2649\u001b[39m \u001b[43m \u001b[49m\u001b[43mget_waiter\u001b[49m\u001b[43m=\u001b[49m\u001b[43mget_waiter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2650\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mschedule_task\u001b[49m\u001b[43m=\u001b[49m\u001b[43mloop\u001b[49m\u001b[43m.\u001b[49m\u001b[43maccept_push\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 2651\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 2652\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# emit output\u001b[39;49;00m\n\u001b[32m 2653\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01myield from\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m_output\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2654\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_mode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprint_mode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msubgraphs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstream\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mqueue\u001b[49m\u001b[43m.\u001b[49m\u001b[43mEmpty\u001b[49m\n\u001b[32m 2655\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2656\u001b[39m loop.after_tick()\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/pregel/_runner.py:167\u001b[39m, in \u001b[36mPregelRunner.tick\u001b[39m\u001b[34m(self, tasks, reraise, timeout, retry_policy, get_waiter, schedule_task)\u001b[39m\n\u001b[32m 165\u001b[39m t = tasks[\u001b[32m0\u001b[39m]\n\u001b[32m 166\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m167\u001b[39m \u001b[43mrun_with_retry\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 168\u001b[39m \u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 169\u001b[39m \u001b[43m \u001b[49m\u001b[43mretry_policy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 170\u001b[39m \u001b[43m \u001b[49m\u001b[43mconfigurable\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 171\u001b[39m \u001b[43m \u001b[49m\u001b[43mCONFIG_KEY_CALL\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mpartial\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[43m \u001b[49m\u001b[43m_call\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 173\u001b[39m \u001b[43m \u001b[49m\u001b[43mweakref\u001b[49m\u001b[43m.\u001b[49m\u001b[43mref\u001b[49m\u001b[43m(\u001b[49m\u001b[43mt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 174\u001b[39m \u001b[43m \u001b[49m\u001b[43mretry_policy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretry_policy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 175\u001b[39m \u001b[43m \u001b[49m\u001b[43mfutures\u001b[49m\u001b[43m=\u001b[49m\u001b[43mweakref\u001b[49m\u001b[43m.\u001b[49m\u001b[43mref\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfutures\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 176\u001b[39m \u001b[43m \u001b[49m\u001b[43mschedule_task\u001b[49m\u001b[43m=\u001b[49m\u001b[43mschedule_task\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 177\u001b[39m \u001b[43m 
\u001b[49m\u001b[43msubmit\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msubmit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 178\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 179\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 180\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 181\u001b[39m \u001b[38;5;28mself\u001b[39m.commit(t, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m 182\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/pregel/_retry.py:42\u001b[39m, in \u001b[36mrun_with_retry\u001b[39m\u001b[34m(task, retry_policy, configurable)\u001b[39m\n\u001b[32m 40\u001b[39m task.writes.clear()\n\u001b[32m 41\u001b[39m \u001b[38;5;66;03m# run the task\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m42\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtask\u001b[49m\u001b[43m.\u001b[49m\u001b[43mproc\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtask\u001b[49m\u001b[43m.\u001b[49m\u001b[43minput\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 43\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ParentCommand \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 44\u001b[39m ns: \u001b[38;5;28mstr\u001b[39m = config[CONF][CONFIG_KEY_CHECKPOINT_NS]\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/_internal/_runnable.py:656\u001b[39m, in \u001b[36mRunnableSeq.invoke\u001b[39m\u001b[34m(self, input, config, **kwargs)\u001b[39m\n\u001b[32m 654\u001b[39m \u001b[38;5;66;03m# run in context\u001b[39;00m\n\u001b[32m 655\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m set_config_context(config, run) \u001b[38;5;28;01mas\u001b[39;00m context:\n\u001b[32m--> \u001b[39m\u001b[32m656\u001b[39m \u001b[38;5;28minput\u001b[39m = \u001b[43mcontext\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 657\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 658\u001b[39m \u001b[38;5;28minput\u001b[39m = step.invoke(\u001b[38;5;28minput\u001b[39m, config)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langgraph/_internal/_runnable.py:400\u001b[39m, in \u001b[36mRunnableCallable.invoke\u001b[39m\u001b[34m(self, input, config, **kwargs)\u001b[39m\n\u001b[32m 398\u001b[39m run_manager.on_chain_end(ret)\n\u001b[32m 399\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m400\u001b[39m ret = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 401\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.recurse \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(ret, Runnable):\n\u001b[32m 402\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m ret.invoke(\u001b[38;5;28minput\u001b[39m, config)\n",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 5\u001b[39m, in \u001b[36magent\u001b[39m\u001b[34m(state)\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34magent\u001b[39m(state: AgenticAgentState) -> AgenticAgentState:\n\u001b[32m 4\u001b[39m llm_with_tools = llm.bind_tools(tools)\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m {\u001b[33m\"\u001b[39m\u001b[33mmessages\u001b[39m\u001b[33m\"\u001b[39m: [\u001b[43mllm_with_tools\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mSystemMessage\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m=\u001b[49m\u001b[43mAGENTIC_PROMPT\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[43mstate\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessages\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m]}\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/runnables/base.py:5695\u001b[39m, in \u001b[36mRunnableBindingBase.invoke\u001b[39m\u001b[34m(self, input, config, **kwargs)\u001b[39m\n\u001b[32m 5688\u001b[39m \u001b[38;5;129m@override\u001b[39m\n\u001b[32m 5689\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34minvoke\u001b[39m(\n\u001b[32m 5690\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 5693\u001b[39m **kwargs: Any | \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 5694\u001b[39m ) -> Output:\n\u001b[32m-> \u001b[39m\u001b[32m5695\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbound\u001b[49m\u001b[43m.\u001b[49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5696\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 5697\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_merge_configs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5698\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43m{\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5699\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:402\u001b[39m, in \u001b[36mBaseChatModel.invoke\u001b[39m\u001b[34m(self, input, config, stop, **kwargs)\u001b[39m\n\u001b[32m 388\u001b[39m \u001b[38;5;129m@override\u001b[39m\n\u001b[32m 389\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34minvoke\u001b[39m(\n\u001b[32m 390\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 395\u001b[39m **kwargs: Any,\n\u001b[32m 396\u001b[39m ) -> AIMessage:\n\u001b[32m 397\u001b[39m config = ensure_config(config)\n\u001b[32m 398\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m cast(\n\u001b[32m 399\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mAIMessage\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 400\u001b[39m cast(\n\u001b[32m 401\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mChatGeneration\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m--> \u001b[39m\u001b[32m402\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgenerate_prompt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 403\u001b[39m \u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_convert_input\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 404\u001b[39m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 405\u001b[39m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcallbacks\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 406\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mtags\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtags\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 407\u001b[39m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmetadata\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 408\u001b[39m \u001b[43m \u001b[49m\u001b[43mrun_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrun_name\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 409\u001b[39m \u001b[43m \u001b[49m\u001b[43mrun_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m.\u001b[49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrun_id\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 410\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 411\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m.generations[\u001b[32m0\u001b[39m][\u001b[32m0\u001b[39m],\n\u001b[32m 412\u001b[39m ).message,\n\u001b[32m 413\u001b[39m )\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:1123\u001b[39m, in \u001b[36mBaseChatModel.generate_prompt\u001b[39m\u001b[34m(self, prompts, stop, callbacks, **kwargs)\u001b[39m\n\u001b[32m 1114\u001b[39m \u001b[38;5;129m@override\u001b[39m\n\u001b[32m 1115\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mgenerate_prompt\u001b[39m(\n\u001b[32m 1116\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 1120\u001b[39m **kwargs: Any,\n\u001b[32m 1121\u001b[39m ) -> LLMResult:\n\u001b[32m 1122\u001b[39m prompt_messages = [p.to_messages() \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m prompts]\n\u001b[32m-> \u001b[39m\u001b[32m1123\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprompt_messages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:933\u001b[39m, in \u001b[36mBaseChatModel.generate\u001b[39m\u001b[34m(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)\u001b[39m\n\u001b[32m 930\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m i, m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(input_messages):\n\u001b[32m 931\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 932\u001b[39m results.append(\n\u001b[32m--> \u001b[39m\u001b[32m933\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_generate_with_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 934\u001b[39m \u001b[43m \u001b[49m\u001b[43mm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 935\u001b[39m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 936\u001b[39m \u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrun_managers\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun_managers\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 937\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 938\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 939\u001b[39m )\n\u001b[32m 940\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 941\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m run_managers:\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py:1235\u001b[39m, in \u001b[36mBaseChatModel._generate_with_cache\u001b[39m\u001b[34m(self, messages, stop, run_manager, **kwargs)\u001b[39m\n\u001b[32m 1233\u001b[39m result = generate_from_stream(\u001b[38;5;28miter\u001b[39m(chunks))\n\u001b[32m 1234\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m inspect.signature(\u001b[38;5;28mself\u001b[39m._generate).parameters.get(\u001b[33m\"\u001b[39m\u001b[33mrun_manager\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m-> \u001b[39m\u001b[32m1235\u001b[39m result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_generate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1236\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 1237\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1238\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1239\u001b[39m result = \u001b[38;5;28mself\u001b[39m._generate(messages, stop=stop, **kwargs)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:1030\u001b[39m, in \u001b[36mChatOllama._generate\u001b[39m\u001b[34m(self, messages, stop, run_manager, **kwargs)\u001b[39m\n\u001b[32m 1023\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_generate\u001b[39m(\n\u001b[32m 1024\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 1025\u001b[39m messages: \u001b[38;5;28mlist\u001b[39m[BaseMessage],\n\u001b[32m (...)\u001b[39m\u001b[32m 1028\u001b[39m **kwargs: Any,\n\u001b[32m 1029\u001b[39m ) -> ChatResult:\n\u001b[32m-> \u001b[39m\u001b[32m1030\u001b[39m final_chunk = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_chat_stream_with_aggregation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1031\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 1032\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1033\u001b[39m generation_info = final_chunk.generation_info\n\u001b[32m 1034\u001b[39m chat_generation = ChatGeneration(\n\u001b[32m 1035\u001b[39m message=AIMessage(\n\u001b[32m 1036\u001b[39m content=final_chunk.text,\n\u001b[32m (...)\u001b[39m\u001b[32m 1043\u001b[39m generation_info=generation_info,\n\u001b[32m 1044\u001b[39m )\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:965\u001b[39m, in \u001b[36mChatOllama._chat_stream_with_aggregation\u001b[39m\u001b[34m(self, messages, stop, run_manager, verbose, **kwargs)\u001b[39m\n\u001b[32m 956\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_chat_stream_with_aggregation\u001b[39m(\n\u001b[32m 957\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 958\u001b[39m messages: \u001b[38;5;28mlist\u001b[39m[BaseMessage],\n\u001b[32m (...)\u001b[39m\u001b[32m 962\u001b[39m **kwargs: Any,\n\u001b[32m 963\u001b[39m ) -> ChatGenerationChunk:\n\u001b[32m 964\u001b[39m final_chunk = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m965\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_iterate_over_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 966\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfinal_chunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m:\u001b[49m\n\u001b[32m 967\u001b[39m \u001b[43m \u001b[49m\u001b[43mfinal_chunk\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:1054\u001b[39m, in \u001b[36mChatOllama._iterate_over_stream\u001b[39m\u001b[34m(self, messages, stop, **kwargs)\u001b[39m\n\u001b[32m 1047\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_iterate_over_stream\u001b[39m(\n\u001b[32m 1048\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 1049\u001b[39m messages: \u001b[38;5;28mlist\u001b[39m[BaseMessage],\n\u001b[32m 1050\u001b[39m stop: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m] | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 1051\u001b[39m **kwargs: Any,\n\u001b[32m 1052\u001b[39m ) -> Iterator[ChatGenerationChunk]:\n\u001b[32m 1053\u001b[39m reasoning = kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mreasoning\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m.reasoning)\n\u001b[32m-> \u001b[39m\u001b[32m1054\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_create_chat_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 1055\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43misinstance\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 1056\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mcontent\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1057\u001b[39m \u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessage\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 1058\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessage\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstream_resp\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmessage\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 1059\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 1060\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/langchain_ollama/chat_models.py:952\u001b[39m, in \u001b[36mChatOllama._create_chat_stream\u001b[39m\u001b[34m(self, messages, stop, **kwargs)\u001b[39m\n\u001b[32m 950\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chat_params[\u001b[33m\"\u001b[39m\u001b[33mstream\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m 951\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client:\n\u001b[32m--> \u001b[39m\u001b[32m952\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client.chat(**chat_params)\n\u001b[32m 953\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client:\n\u001b[32m 954\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28mself\u001b[39m._client.chat(**chat_params)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/PycharmProjects/assistance-engine/.venv/lib/python3.11/site-packages/ollama/_client.py:184\u001b[39m, in \u001b[36mClient._request.<locals>.inner\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 182\u001b[39m part = json.loads(line)\n\u001b[32m 183\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m err := part.get(\u001b[33m'\u001b[39m\u001b[33merror\u001b[39m\u001b[33m'\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m184\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ResponseError(err)\n\u001b[32m 185\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(**part)\n",
|
||||
"\u001b[31mResponseError\u001b[39m: failed to parse JSON: unexpected end of JSON input (status code: -1)",
|
||||
"During task with name 'agent' and id '9110cf29-5205-b67b-0456-234df433158a'"
|
||||
"What types of includes does AVAP have?\n",
|
||||
"[reformulate] 'What types of includes does AVAP have?' → '\"avap includes type\"'\n",
|
||||
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
||||
"\n",
|
||||
"What types of includes does AVAP have?\n",
|
||||
"[retrieve] 3 docs fetched\n",
|
||||
"[1] id=chunk-1 source=Untitled\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Token:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"ASSIGN\n",
|
||||
"\n",
|
||||
"[2] id=chunk-2 source=Untitled\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"> **Nota de implementación:** `<connector_instantiation>` se distingue de `<orm_connector_init>` (ORM) únicamente por contexto semántico: el UUID pasado como argumento determina si el adaptador resuelto es un ORM de base de datos o un proxy de terceros. La gramática los trata de forma idéntica; el motor de ejecución selecciona el adaptador apropiado en runtime.\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## SECCIÓN VI: Utilidades, Criptografía y Manipulación de Datos\n",
|
||||
"\n",
|
||||
"AVAP incluye un set de comandos integrados de alto nivel para manipular tipos complejos (JSON y Listas), tiempos, textos y generar hashes.\n",
|
||||
"\n",
|
||||
"### 6.1 Manipulación Nativa de Listas y Objetos JSON\n",
|
||||
"Para extraer y mutar estructuras complejas, AVAP provee comandos nativos específicos:\n",
|
||||
"* **`variableToList(elemento, destino)`**: Fuerza a que una variable escalar se convierta en una estructura iterable de lista.\n",
|
||||
"* **`itemFromList(lista_origen, indice, destino)`**: Extrae de forma segura el elemento contenido en la posición `indice` de una lista.\n",
|
||||
"* **`variableFromJSON(json_origen, clave, destino)`**: Parsea un objeto JSON en memoria y extrae el valor correspondiente a la `clave`.\n",
|
||||
"* **`AddVariableToJSON(clave, valor, json_destino)`**: Inyecta dinámicamente una nueva propiedad dentro de un objeto JSON existente.\n",
|
||||
"\n",
|
||||
"### 6.2 Criptografía y Expresiones Regulares\n",
|
||||
"* **`encodeSHA256` y `encodeMD5(origen, destino)`**: Funciones criptográficas que encriptan de forma irreversible un texto. Vitales para el almacenamiento seguro de contraseñas.\n",
|
||||
"* **`getRegex(origen, patron, destino)`**: Aplica una Expresión Regular (`patron`) sobre la variable de origen, extrayendo las coincidencias exactas.\n",
|
||||
"\n",
|
||||
"### 6.3 Transformación de Tiempo y Cadenas\n",
|
||||
"* **Fechas:** `getTimeStamp` (convierte un string a Epoch), `getDateTime` (Epoch a string legible), y `stampToDatetime` (Epoch a objeto datetime estructurado). Soportan formatos de calendario y cálculos con TimeDeltas.\n",
|
||||
"* **Cadenas:** `replace` (saneamiento y sustitución de texto) y `randomString` (generación determinista de claves/tokens aleatorios).\n",
|
||||
"\n",
|
||||
"### Especificación BNF (Sección VI)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"/* [CORRECCIÓN] Todas las subreglas de <util_command> están ahora completamente expandidas. */\n",
|
||||
"<util_command> ::= <json_list_cmd> | <crypto_cmd> | <regex_cmd> | <datetime_cmd> | <stamp_cmd> | <string_cmd> | <replace_cmd>\n",
|
||||
"\n",
|
||||
"/* Manipulación de listas y JSON */\n",
|
||||
"<json_list_cmd> ::= \"variableToList(\" <expression> \",\" <identifier> \")\"\n",
|
||||
" | \"itemFromList(\" <identifier> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||
" | \"variableFromJSON(\" <identifier> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||
" | \"AddVariableToJSON(\" <expression> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||
"\n",
|
||||
"/* Criptografía */\n",
|
||||
"<crypto_cmd> ::= \"encodeSHA256(\" <identifier_or_string> \",\" <identifier> \")\"\n",
|
||||
" | \"encodeMD5(\" <identifier_or_string> \",\" <identifier> \")\"\n",
|
||||
"\n",
|
||||
"/* Expresiones regulares */\n",
|
||||
"<regex_cmd> ::= \"getRegex(\" <identifier> \",\" <stringliteral> \",\" <identifier> \")\"\n",
|
||||
"\n",
|
||||
"<datetime_cmd> ::= \"getDateTime(\" <stringliteral> \",\" <expression> \",\" <stringliteral> \",\" <identifier> \")\"\n",
|
||||
"/* Argumentos: formato_salida, epoch_origen, zona_horaria, destino */\n",
|
||||
"\n",
|
||||
"<stamp_cmd> ::= \"stampToDatetime(\" <expression> \",\" <stringliteral> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||
"/* Argumentos: epoch_origen, formato, timedelta, destino */\n",
|
||||
" | \"getTimeStamp(\" <stringliteral> \",\" <stringliteral> \",\" <expression> \",\" <identifier> \")\"\n",
|
||||
"/* Argumentos: fecha_string, formato_entrada, timedelta, destino */\n",
|
||||
"\n",
|
||||
"<string_cmd> ::= \"randomString(\" <expression> \",\" <identifier> \")\"\n",
|
||||
"/* Argumentos: longitud, destino */\n",
|
||||
"\n",
|
||||
"<replace_cmd> ::= \"replace(\" <identifier_or_string> \",\" <stringliteral> \",\" <stringliteral> \",\" <identifier> \")\"\n",
|
||||
"/* Argumentos: origen, patron_busqueda, reemplazo, destino */\n",
|
||||
"\n",
|
||||
"[3] id=chunk-3 source=Untitled\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"---\n",
|
||||
"\n",
|
||||
"## SECCIÓN IX: Expresiones y Gramática Léxica Estricta\n",
|
||||
"\n",
|
||||
"Esta sección es el corazón matemático evaluador de AVAP. Define la jerarquía exacta (Precedencia) y provee soporte nativo para características avanzadas similares a Python.\n",
|
||||
"\n",
|
||||
"### 9.1 Cast de Tipos Explícito\n",
|
||||
"AVAP permite conversiones de tipos (Type Casting) en cualquier evaluación utilizando funciones constructoras estándar. Puedes transformar variables dinámicamente usando `int(var)`, `float(var)` o `str(var)`.\n",
|
||||
"\n",
|
||||
"### 9.2 Slicing y Comprensiones (Comprehensions)\n",
|
||||
"* **Slicing (Cortes):** Puedes extraer fragmentos de listas o strings utilizando la notación de dos puntos. Ejemplo: `mi_lista[1:4]` (extrae desde el índice 1 hasta el 3).\n",
|
||||
"* **Comprehensions:** AVAP soporta la construcción rápida de listas mediante iteradores en una sola línea, permitiendo filtrar y mapear colecciones enteras (ej. `[x * 2 for x in valores if x > 0]`).\n",
|
||||
"\n",
|
||||
"### 9.3 Análisis Léxico (Lexer) y Documentación\n",
|
||||
"AVAP cuenta con tres niveles de descarte de texto para anotaciones humanas:\n",
|
||||
"1. **Comentarios de Línea (`//`):** Ignora el texto hasta el salto de línea.\n",
|
||||
"2. **Comentarios de Bloque (`/* ... */`):** Para aislar bloques enteros multilínea.\n",
|
||||
"3. **Comentarios de Documentación (`///`):** Utilizados por analizadores de código o IDEs para generar documentación técnica automática (Docstrings) a partir del código fuente.\n",
|
||||
"\n",
|
||||
"### Especificación BNF (Sección IX)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"/* Jerarquía de Expresiones (Precedencia de menor a mayor) */\n",
|
||||
"<expression> ::= <logical_or>\n",
|
||||
"<logical_or> ::= <logical_and> ( \"or\" <logical_and> )*\n",
|
||||
"<logical_and> ::= <logical_not> ( \"and\" <logical_not> )*\n",
|
||||
"<logical_not> ::= \"not\" <logical_not> | <comparison>\n",
|
||||
"\n",
|
||||
"<comparison> ::= <arithmetic> ( <comp_op> <arithmetic> )*\n",
|
||||
"<comp_op> ::= \"==\" | \"!=\" | \"<\" | \">\" | \"<=\" | \">=\" | \"in\" | \"is\"\n",
|
||||
"\n",
|
||||
"<arithmetic> ::= <term> ( ( \"+\" | \"-\" ) <term> )*\n",
|
||||
"<term> ::= <factor> ( ( \"*\" | \"/\" | \"%\" ) <factor> )*\n",
|
||||
"<factor> ::= ( \"+\" | \"-\" ) <factor> | <power>\n",
|
||||
"<power> ::= <primary> [ \"**\" <factor> ]\n",
|
||||
"\n",
|
||||
"/* Primarios y Átomos (Accesos, Castings, Slicing, Métodos y Funciones)\n",
|
||||
" La regla <primary> cubre también el acceso a métodos de objetos conector\n",
|
||||
" (conector.metodo(...)) y el acceso por clave a sus resultados (resultado[\"key\"]) */\n",
|
||||
"<primary> ::= <atom>\n",
|
||||
" | <primary> \".\" <identifier>\n",
|
||||
" | <primary> \"[\" <expression> \"]\"\n",
|
||||
" | <primary> \"[\" [<expression>] \":\" [<expression>] [\":\" [<expression>]] \"]\"\n",
|
||||
" | <primary> \"(\" [<argument_list>] \")\"\n",
|
||||
"\n",
|
||||
"<atom> ::= <identifier>\n",
|
||||
" | \"$\" <identifier>\n",
|
||||
" | <literal>\n",
|
||||
" | \"(\" <expression> \")\"\n",
|
||||
" | <list_display>\n",
|
||||
" | <dict_display>\n",
|
||||
"\n",
|
||||
"/* Estructuras de Datos, Comprensiones y Argumentos */\n",
|
||||
"<list_display> ::= \"[\" [<argument_list>] \"]\"\n",
|
||||
" | \"[\" <expression> \"for\" <identifier> \"in\" <expression> [<if_clause>] \"]\"\n",
|
||||
"<if_clause> ::= \"if\" <expression>\n",
|
||||
"<dict_display> ::= \"{\" [<key_datum_list>] \"}\"\n",
|
||||
"<key_datum_list> ::= <key_datum> ( \",\" <key_datum> )*\n",
|
||||
"<key_datum> ::= <expression> \":\" <expression>\n",
|
||||
"<argument_list> ::= <expression> ( \",\" <expression> )*\n",
|
||||
"\n",
|
||||
"/* Tipo numérico unificado */\n",
|
||||
"<number> ::= <floatnumber> | <integer>\n",
|
||||
"\n",
|
||||
"/* Literales (Tipos de Datos Primitivos Soportados) */\n",
|
||||
"<literal> ::= <stringliteral> | <number> | <boolean> | \"None\"\n",
|
||||
"<boolean> ::= \"True\" | \"False\"\n",
|
||||
"<integer> ::= [0-9]+\n",
|
||||
"<floatnumber> ::= [0-9]+ \".\" [0-9]* | \".\" [0-9]+\n",
|
||||
"\n",
|
||||
"/* Cadenas de Texto con soporte de secuencias de escape */\n",
|
||||
"<stringliteral> ::= \"\\\"\" <text_double> \"\\\"\" | \"'\" <text_single> \"'\"\n",
|
||||
"<escape_sequence> ::= \"\\\\\" ( \"\\\"\" | \"'\" | \"\\\\\" | \"n\" | \"t\" | \"r\" | \"0\" )\n",
|
||||
"<text_double> ::= ( [^\"\\\\] | <escape_sequence> )*\n",
|
||||
"<text_single> ::= ( [^'\\\\] | <escape_sequence> )*\n",
|
||||
"<identifier_or_string> ::= <identifier> | <stringliteral>\n",
|
||||
"\n",
|
||||
"/* Reglas de Comentarios para el Lexer\n",
|
||||
" El lexer aplica longest-match: /// debe evaluarse ANTES que // */\n",
|
||||
"<doc_comment> ::= \"///\" <any_text>\n",
|
||||
"<line_comment> ::= \"//\" <any_text>\n",
|
||||
"<block_comment> ::= \"/*\" <any_content> \"*/\"\n",
|
||||
"<any_text> ::= [^\\r\\n]*\n",
|
||||
"<any_content> ::= /* Cualquier secuencia de caracteres que no contenga la subcadena \"*/\" */\n",
|
||||
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
||||
"\n",
|
||||
"What types of includes does AVAP have?\n",
|
||||
"==================================\u001b[1m Ai Message \u001b[0m==================================\n",
|
||||
"\n",
|
||||
"AVAP has two main types of include:\n",
|
||||
"\n",
|
||||
"1. **<connector_instantiation>:** This is used to instantiate a connector, which could be for a database connection or a third-party API.\n",
|
||||
"2. **<orm_connector_init>:** This term seems to be related to initializing an Object-Relational Mapping (ORM) connector, indicating that the context suggests it's part of a specific ORM setup.\n",
|
||||
"\n",
|
||||
"Both types are treated similarly in terms of grammar but differ semantically by their purpose - one is for database connections or third-party APIs, while the other is specifically for connecting to ORMs. The engine selects the appropriate adapter based on runtime context.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = stream_graph_updates(user_input, agentic_graph)"
|
||||
"a = stream_graph_updates(user_input, guided_graph)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,134 +1,60 @@
|
|||
from enum import Enum
|
||||
import typer
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
from elasticsearch import Elasticsearch
|
||||
from langchain_elasticsearch import ElasticsearchStore
|
||||
from chonkie import SemanticChunker
|
||||
|
||||
from src.utils.emb_factory import create_embedding_model
|
||||
from scripts.pipelines.tasks.chunk import (
|
||||
read_files,
|
||||
get_chunk_docs,
|
||||
convert_chunks_to_document
|
||||
fetch_documents,
|
||||
process_documents,
|
||||
export_documents,
|
||||
ingest_documents
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
ELASTICSEARCH_LOCAL_URL = os.getenv("ELASTICSEARCH_LOCAL_URL")
|
||||
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL")
|
||||
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX")
|
||||
OLLAMA_URL = os.getenv("OLLAMA_URL")
|
||||
OLLAMA_EMB_MODEL_NAME = os.getenv("OLLAMA_EMB_MODEL_NAME")
|
||||
AVAP_WEB_DOCS_URL = os.getenv("AVAP_WEB_DOCS_URL")
|
||||
HF_EMB_MODEL_NAME = os.getenv("HF_EMB_MODEL_NAME")
|
||||
|
||||
class DistanceStrategy(str, Enum):
|
||||
euclidean = "EUCLIDEAN_DISTANCE"
|
||||
max_inner_product = "MAX_INNER_PRODUCT"
|
||||
dot_product = "DOT_PRODUCT"
|
||||
jaccard = "JACCARD"
|
||||
cosine = "COSINE"
|
||||
|
||||
|
||||
@app.command()
|
||||
def elasticsearch_ingestion(
|
||||
docs_folder_path: str = "docs",
|
||||
docs_folder_path: str = "docs/samples",
|
||||
output_path: str = "ingestion/chunks.json",
|
||||
docs_extension: list[str] = [".md", ".avap"],
|
||||
es_index: str = "avap-docs-test-v3",
|
||||
es_request_timeout: int = 120,
|
||||
es_max_retries: int = 5,
|
||||
es_retry_on_timeout: bool = True,
|
||||
distance_strategy: DistanceStrategy = DistanceStrategy.cosine,
|
||||
chunk_size: int = 2048,
|
||||
chunk_threshold: float = 0.5,
|
||||
chunk_similarity_window: int = 3,
|
||||
chunk_skip_window: int = 1,
|
||||
):
|
||||
delete_es_index: bool = True
|
||||
) -> None:
|
||||
"""
|
||||
Pipeline to ingest documents into an Elasticsearch index.
|
||||
The pipeline includes fetching documents from a specified folder, processing them into chunks, and then ingesting those chunks into the specified Elasticsearch index.
|
||||
|
||||
Args:
|
||||
docs_folder_path (str): Path to the folder containing documents to be ingested. Default is "docs/samples".
|
||||
docs_extension (list[str]): List of file extensions to filter by (e.g., [".md", ".avap"]). Default is [".md", ".avap"].
|
||||
es_index (str): Name of the Elasticsearch index to ingest documents into. Default is "avap-docs-test-v3".
|
||||
es_request_timeout (int): Timeout in seconds for Elasticsearch requests. Default is 120 seconds.
|
||||
es_max_retries (int): Maximum number of retries for Elasticsearch requests in case of failure. Default is 5 retries.
|
||||
es_retry_on_timeout (bool): Whether to retry Elasticsearch requests on timeout. Default is True.
|
||||
delete_es_index (bool): Whether to delete the existing Elasticsearch index before ingestion. Default is True.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
logger.info("Starting Elasticsearch ingestion pipeline...")
|
||||
logger.info(f"Reading and concatenating files from folder: {docs_folder_path}/developer.avapframework.com")
|
||||
avap_github_docs = read_files(f"{docs_folder_path}/avap_language_github_docs", concatenate=False)
|
||||
avap_web_docs_intro = read_files(f"{docs_folder_path}/developer.avapframework.com", "intro", concatenate=True)
|
||||
logger.info(f"Fetching files from {docs_folder_path}...")
|
||||
docs_path = fetch_documents(docs_folder_path, docs_extension)
|
||||
|
||||
# Check chapters in developer.avapframework.com folder and read and concatenate files for each chapter
|
||||
chapters = sorted({
|
||||
p.name.split("_")[0]
|
||||
for p in Path(f"{docs_folder_path}/developer.avapframework.com").glob("chapter*.md")
|
||||
})
|
||||
logger.info("Processing docs...")
|
||||
chunked_docs = process_documents(docs_path)
|
||||
|
||||
avap_web_docs_chapters = [
|
||||
item
|
||||
for chapter in chapters
|
||||
for item in read_files(
|
||||
f"{docs_folder_path}/developer.avapframework.com",
|
||||
f"{chapter}_",
|
||||
concatenate=True
|
||||
)
|
||||
]
|
||||
logger.info(f"Ingesting chunks in Elasticsearch index: {es_index}...")
|
||||
elasticsearch_docs = ingest_documents(chunked_docs, es_index, es_request_timeout, es_max_retries,
|
||||
es_retry_on_timeout, delete_es_index)
|
||||
|
||||
avap_web_docs_appendices = read_files(f"{docs_folder_path}/developer.avapframework.com", "appendices_", concatenate=False)
|
||||
avap_samples_docs = read_files(f"{docs_folder_path}/samples", concatenate=False)
|
||||
logger.info(f"Exporting processed documents to {output_path}...")
|
||||
export_documents(elasticsearch_docs, output_path)
|
||||
|
||||
logger.info("Instantiating semantic chunker...")
|
||||
chunker = SemanticChunker(
|
||||
embedding_model=HF_EMB_MODEL_NAME,
|
||||
chunk_size=chunk_size,
|
||||
threshold=chunk_threshold,
|
||||
similarity_window=chunk_similarity_window,
|
||||
skip_window=chunk_skip_window
|
||||
)
|
||||
|
||||
logger.info("Chunking AVAP GitHub docs...")
|
||||
avap_github_docs_chunks = get_chunk_docs(avap_github_docs, chunker)
|
||||
|
||||
logger.info("Chunking AVAP web docs chapters...")
|
||||
avap_web_docs_chapters_chunks = get_chunk_docs(avap_web_docs_chapters, chunker)
|
||||
|
||||
logger.info("Creating Langchain Document to index...")
|
||||
avap_github_langchain_docs = convert_chunks_to_document(avap_github_docs_chunks)
|
||||
avap_web_chapters_langchain_docs = convert_chunks_to_document(avap_web_docs_chapters_chunks)
|
||||
avap_web_intro_langchain_docs = convert_chunks_to_document(avap_web_docs_intro)
|
||||
avap_web_appendices_langchain_docs = convert_chunks_to_document(avap_web_docs_appendices)
|
||||
avap_samples_langchain_docs = convert_chunks_to_document(avap_samples_docs)
|
||||
avap_documents = avap_github_langchain_docs + avap_web_chapters_langchain_docs + avap_web_intro_langchain_docs + avap_web_appendices_langchain_docs + avap_samples_langchain_docs
|
||||
|
||||
logger.info("Connecting to Elasticsearch...")
|
||||
try:
|
||||
es = Elasticsearch(
|
||||
ELASTICSEARCH_LOCAL_URL,
|
||||
request_timeout=es_request_timeout,
|
||||
max_retries=es_max_retries,
|
||||
retry_on_timeout=es_retry_on_timeout,
|
||||
)
|
||||
except:
|
||||
logger.exception("Failed to connect to Elasticsearch.")
|
||||
raise
|
||||
|
||||
logger.info("Instantiating embeddings model...")
|
||||
try:
|
||||
embeddings = create_embedding_model(
|
||||
provider="ollama",
|
||||
model=OLLAMA_EMB_MODEL_NAME,
|
||||
base_url=OLLAMA_LOCAL_URL,
|
||||
)
|
||||
except:
|
||||
logger.exception("Failed to instantiate embeddings model.")
|
||||
raise
|
||||
|
||||
logger.info(f"Checking if index {ELASTICSEARCH_INDEX} exists and deleting if it does...")
|
||||
if es.indices.exists(index=ELASTICSEARCH_INDEX):
|
||||
es.indices.delete(index=ELASTICSEARCH_INDEX)
|
||||
|
||||
logger.info(f"Uploading documents to index {ELASTICSEARCH_INDEX}...")
|
||||
ElasticsearchStore.from_documents(
|
||||
avap_documents,
|
||||
embeddings,
|
||||
client=es,
|
||||
index_name=ELASTICSEARCH_INDEX,
|
||||
distance_strategy=distance_strategy.value,
|
||||
)
|
||||
logger.info(f"Finished uploading documents to index {ELASTICSEARCH_INDEX}.")
|
||||
logger.info(f"Finished ingesting in {es_index}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Use:
|
||||
python generate_mbpp_avap.py
|
||||
python generate_mbpp_avap.py --lrm path/to/avap.md
|
||||
python generate_mbpp_avap.py --lrm avap.md --output output/mbpp_avap.json --problems 300
|
||||
python generate_mbap.py
|
||||
python generate_mbap.py --lrm path/to/avap.md
|
||||
python generate_mbap.py --lrm avap.md --output output/mbpp_avap.json --problems 300
|
||||
|
||||
Requirements:
|
||||
pip install anthropic
|
||||
|
|
@ -53,7 +53,7 @@ REGLAS ESTRICTAS para el código AVAP generado:
|
|||
5. if() Modo 1: if(var_o_literal, var_o_literal, "operador")
|
||||
— los argumentos NO pueden ser expresiones de acceso como dict['key'];
|
||||
hay que extraer el valor a una variable propia primero.
|
||||
6. if() Modo 2: if(None, None, "expresion_completa_como_string")
|
||||
6. if() Modo 2: if(None, None, `expresion_completa_como_string`)
|
||||
7. _status se asigna con: addVar(_status, 404)
|
||||
8. ormAccessSelect firma: ormAccessSelect(campos, "tabla", selector, varTarget)
|
||||
— selector puede ser cadena vacía.
|
||||
|
|
@ -62,7 +62,7 @@ REGLAS ESTRICTAS para el código AVAP generado:
|
|||
|
||||
MODO DE EJECUCIÓN — MUY IMPORTANTE:
|
||||
- El código se ejecuta DIRECTAMENTE, línea a línea, sin servidor ni registro de endpoints.
|
||||
- NUNCA uses registerEndpoint(), NUNCA uses mainHandler(), NUNCA envuelvas el código en funciones solo para ejecutarlo salvo que queramos probar la funcionalidad de funciones.
|
||||
- NUNCA uses registerEndpoint(), NUNCA uses mainHandler(), NUNCA envuelvas el código en funciones solo para ejecutarlo.
|
||||
- El código correcto es simplemente las instrucciones en línea, por ejemplo:
|
||||
result = "Hello World"
|
||||
addResult(result)
|
||||
|
|
@ -82,29 +82,48 @@ Estructura exacta de cada elemento:
|
|||
"task_id": <número entero>,
|
||||
"text": "<enunciado del problema en español>",
|
||||
"code": "<código AVAP con saltos de línea como \\n>",
|
||||
"test_inputs": { "<param1>": <valor1>, "<param2>": <valor2> },
|
||||
"test_list": ["<expr_python_1>", "<expr_python_2>"]
|
||||
}
|
||||
|
||||
FORMATO DE test_inputs — MUY IMPORTANTE:
|
||||
- Es un objeto JSON con un valor fijo para cada variable que el código recibe via addParam().
|
||||
- Los nombres de las claves deben coincidir EXACTAMENTE con el nombre de variable usado en addParam().
|
||||
- Los valores deben ser concretos y representativos del problema (no genéricos como "test" o 123).
|
||||
- Si el código no tiene ningún addParam(), el campo test_inputs debe ser un objeto vacío: {}
|
||||
- Estos valores son los que el evaluador inyectará en el stack antes de ejecutar el código,
|
||||
de modo que las aserciones de test_list puedan validar las variables de salida resultantes.
|
||||
|
||||
Ejemplo con addParam:
|
||||
código: addParam("password", password)\\nencodeSHA256(password, hashed)\\naddResult(hashed)
|
||||
test_inputs: { "password": "secret123" }
|
||||
test_list: ["re.match(r'^[a-f0-9]{64}$', hashed)"]
|
||||
|
||||
Ejemplo sin addParam:
|
||||
código: randomString(16, token)\\naddResult(token)
|
||||
test_inputs: {}
|
||||
test_list: ["re.match(r'^[a-zA-Z0-9]{16}$', token)"]
|
||||
|
||||
FORMATO DE test_list — MUY IMPORTANTE:
|
||||
Cada aserción debe ser una expresión Python con re.match() o re.search()
|
||||
Cada aserción debe ser una expresión Python con re.match()
|
||||
evaluable directamente sobre las variables del stack AVAP (disponibles como
|
||||
variables Python locales). El módulo 're' está siempre disponible.
|
||||
La expresión debe devolver un match object (truthy) si el test pasa.
|
||||
|
||||
Reglas estrictas:
|
||||
- USA ÚNICAMENTE re.match(r'<patrón>', <variable>) o re.search(r'<patrón>', str(<variable>))
|
||||
- USA ÚNICAMENTE re.match(r'<patrón>', <variable>)
|
||||
- NO combines expresiones re.match en una aserción, cada asercion tiene que ser un unico re.match(r'<patrón>', <variable>)
|
||||
- Convierte a string si es necesario: re.match(r'^\\d+$', str(result))
|
||||
- Puedes encadenar con 'and': re.match(r'^[a-zA-Z0-9]{32}$', token) and re.match(r'.{32}', token)
|
||||
- Las variables referenciadas deben existir en el stack tras ejecutar el código.
|
||||
- NUNCA uses comparaciones directas (==, !=, >, <).
|
||||
- NUNCA uses isinstance(), len(), assert, ni texto descriptivo.
|
||||
- NUNCA uses nada que no sea re.match() o re.search().
|
||||
- NUNCA uses nada que no sea re.match().
|
||||
|
||||
Ejemplos correctos de test_list:
|
||||
"re.match(r'^[a-f0-9]{64}$', hashed)"
|
||||
"re.match(r'^[a-zA-Z0-9]{32}$', token)"
|
||||
"re.match(r'^\\d{4}-\\d{2}-\\d{2}$', date_str)"
|
||||
"re.search(r'Hello', result)"
|
||||
"re.match(r'^-?\\d+(\\.\\d+)?$', str(result))"
|
||||
"re.match(r'^(par|impar)$', result)"
|
||||
"re.match(r'^40[134]$', str(_status))"
|
||||
|
|
@ -138,22 +157,26 @@ Responde ÚNICAMENTE con el array JSON. Sin texto antes ni después.
|
|||
|
||||
def parse_response(raw: str):
|
||||
text = raw.strip()
|
||||
|
||||
if text.startswith("```"):
|
||||
lines = text.splitlines()
|
||||
inner = lines[1:]
|
||||
if inner and inner[-1].strip() == "```":
|
||||
inner = inner[:-1]
|
||||
text = "\n".join(inner).strip()
|
||||
|
||||
problems = json.loads(text)
|
||||
|
||||
if not isinstance(problems, list):
|
||||
raise ValueError("answer is not a JSON.")
|
||||
raise ValueError("response is not an JSON array")
|
||||
|
||||
for p in problems:
|
||||
for field in ("task_id", "text", "code", "test_list"):
|
||||
if field not in p:
|
||||
raise ValueError(f"field '{field}' not found in a problem.")
|
||||
raise ValueError(f"Field missing '{field}' in task_id={p.get('task_id','?')}.")
|
||||
if "test_inputs" not in p:
|
||||
p["test_inputs"] = {}
|
||||
if not isinstance(p["test_inputs"], dict):
|
||||
raise ValueError(f"'test_inputs' must by a JSON Object (task_id={p.get('task_id','?')}).")
|
||||
|
||||
return problems
|
||||
|
||||
|
|
@ -8,8 +8,7 @@ from botocore.config import Config
|
|||
from pathlib import Path
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
from src.utils.llm_factory import create_chat_model
|
||||
from src.config import RAW_DIR, INTERIM_DIR
|
||||
from scripts.pipelines.input.prompts import get_prompt_mbpp
|
||||
from scripts.pipelines.tasks.prompts import get_prompt_mbpp
|
||||
|
||||
|
||||
app = typer.Typer()
|
||||
|
|
|
|||
|
|
@ -1,136 +1,277 @@
|
|||
import os
|
||||
import re
|
||||
import uuid
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from dataclasses import replace
|
||||
from pathlib import Path
|
||||
from typing import Any, Union
|
||||
|
||||
from loguru import logger
|
||||
from chonkie import Chunk, SemanticChunker
|
||||
from langchain_core.documents import Document
|
||||
|
||||
|
||||
def replace_javascript_with_avap(text: str) -> str:
|
||||
"""
|
||||
Replace mentions of javascript language with avap in the text.
|
||||
Handles code blocks, language identifiers, and references.
|
||||
|
||||
Args:
|
||||
text: The text to process.
|
||||
|
||||
Returns:
|
||||
The text with javascript references replaced with avap.
|
||||
"""
|
||||
# Replace ```javascript with ```avap
|
||||
text = text.replace("```javascript", "```avap")
|
||||
|
||||
# Replace ```js with ```avap
|
||||
text = text.replace("```js", "```avap")
|
||||
|
||||
# Replace common phrases (case-insensitive)
|
||||
text = re.sub(r"\bjavascript\s+code\b", "avap code", text, flags=re.IGNORECASE)
|
||||
text = re.sub(
|
||||
r"\bjavascript\s+example\b", "avap example", text, flags=re.IGNORECASE
|
||||
from chonkie import (
|
||||
Chunk,
|
||||
ElasticHandshake,
|
||||
FileFetcher,
|
||||
MarkdownChef,
|
||||
TextChef,
|
||||
TokenChunker,
|
||||
MarkdownDocument
|
||||
)
|
||||
text = re.sub(r"\bjavascript\b(?!\s+file)", "avap", text, flags=re.IGNORECASE)
|
||||
from elasticsearch import Elasticsearch
|
||||
from loguru import logger
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
return text
|
||||
from scripts.pipelines.tasks.embeddings import OllamaEmbeddings
|
||||
from src.config import settings
|
||||
|
||||
|
||||
def read_files(
|
||||
folder_path: str, file_prefix: str | None = None, concatenate: bool = True
|
||||
) -> list[dict]:
|
||||
def _get_text(element) -> str:
|
||||
for attr in ("text", "content", "markdown"):
|
||||
value = getattr(element, attr, None)
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
raise AttributeError(
|
||||
f"Could not extract text from element of type {type(element).__name__}"
|
||||
)
|
||||
|
||||
|
||||
def _merge_markdown_document(processed_doc: MarkdownDocument) -> MarkdownDocument:
    """Fuse code blocks and tables into the text chunk that precedes them.

    MarkdownChef emits chunks, code blocks and tables as three separate
    element streams. Walk all three in document order and append every
    code/table element onto the most recent text chunk, extending that
    chunk's end index and token count accordingly.
    """
    # Tag every element with its kind and span so the three streams can be
    # interleaved by document position.
    tagged = [("chunk", c.start_index, c.end_index, c) for c in processed_doc.chunks]
    tagged += [("code", c.start_index, c.end_index, c) for c in processed_doc.code]
    tagged += [("table", t.start_index, t.end_index, t) for t in processed_doc.tables]
    tagged.sort(key=lambda entry: (entry[1], entry[2]))

    # Accumulate groups of [base chunk, text parts, end index, token count].
    groups: list[list] = []
    for kind, _, _, element in tagged:
        if kind == "chunk":
            groups.append(
                [element, [_get_text(element)], element.end_index, element.token_count]
            )
        elif groups:
            # A code/table element before the first chunk has no host chunk
            # to merge into, so it is dropped.
            open_group = groups[-1]
            open_group[1].append(_get_text(element))
            open_group[2] = max(open_group[2], element.end_index)
            open_group[3] += getattr(element, "token_count", 0)

    merged_chunks = [
        replace(
            base,
            text="\n\n".join(part for part in parts if part),
            end_index=end_index,
            token_count=token_count,
        )
        for base, parts, end_index, token_count in groups
    ]

    fused_doc = deepcopy(processed_doc)
    fused_doc.chunks = merged_chunks
    # Keep references to the original (non-copied) code/table element lists.
    fused_doc.code = processed_doc.code
    fused_doc.tables = processed_doc.tables
    return fused_doc
|
||||
|
||||
|
||||
class ElasticHandshakeWithMetadata(ElasticHandshake):
    """Extended ElasticHandshake that preserves chunk metadata in Elasticsearch.

    Unlike the base handshake, each chunk is expected to arrive as a dict of
    the form ``{"chunk": Chunk, "extra_metadata": dict}``; any extra metadata
    keys are merged into the indexed ``_source`` document.
    """

    def _create_bulk_actions(self, chunks: list[dict]) -> list[dict[str, Any]]:
        """Generate bulk actions including metadata.

        Embeds all chunk texts in one batch call, then builds one bulk-index
        action per chunk with text, embedding, span indices, token count and
        any ``extra_metadata`` keys merged into ``_source``.
        """
        actions = []
        # One batched embedding call for all chunk texts.
        embeddings = self.embedding_model.embed_batch([chunk["chunk"].text for chunk in chunks])

        for i, chunk in enumerate(chunks):
            source = {
                "text": chunk["chunk"].text,
                "embedding": embeddings[i],
                "start_index": chunk["chunk"].start_index,
                "end_index": chunk["chunk"].end_index,
                "token_count": chunk["chunk"].token_count,
            }

            # Include metadata if it exists
            if chunk.get("extra_metadata"):
                source.update(chunk["extra_metadata"])

            actions.append({
                "_index": self.index_name,
                # _generate_id is inherited from ElasticHandshake.
                "_id": self._generate_id(i, chunk["chunk"]),
                "_source": source,
            })

        return actions

    def write(self, chunks: Union[Chunk, list[Chunk]]) -> list[dict[str, Any]]:
        """Write the chunks to the Elasticsearch index using the bulk API.

        NOTE(review): the annotation says ``Chunk | list[Chunk]`` but callers
        in this module pass dicts shaped ``{"chunk": Chunk, "extra_metadata":
        dict}`` (see ``_create_bulk_actions``) — confirm and align the hint.

        Returns the list of bulk actions that were sent (not the ES responses).
        """
        # Backward-compatible: wrap a single bare Chunk in a list.
        if isinstance(chunks, Chunk):
            chunks = [chunks]

        actions = self._create_bulk_actions(chunks)

        # Use the bulk helper to efficiently write the documents
        from elasticsearch.helpers import bulk

        # raise_on_error=False: collect per-document errors instead of raising.
        success, errors = bulk(self.client, actions, raise_on_error=False)

        if errors:
            logger.warning(f"Encountered {len(errors)} errors during bulk indexing.")  # type: ignore
            # Optionally log the first few errors for debugging
            for i, error in enumerate(errors[:5]):  # type: ignore
                logger.error(f"Error {i + 1}: {error}")

        logger.info(f"Chonkie wrote {success} chunks to Elasticsearch index: {self.index_name}")

        return actions
|
||||
|
||||
|
||||
def fetch_documents(docs_folder_path: str, docs_extension: list[str]) -> list[Path]:
    """Fetch files from a folder that match the specified extensions.

    Args:
        docs_folder_path (str): Path to the folder containing documents,
            relative to the project root.
        docs_extension (list[str]): List of file extensions to filter by
            (e.g., [".md", ".avap"]).

    Returns:
        List of Paths to the fetched documents.
    """
    fetcher = FileFetcher()
    # NOTE(review): this reads `settings.proj_root`, while config.py also
    # defines a `project_root` property — confirm which attribute name the
    # deployed Settings class actually exposes.
    docs_path = fetcher.fetch(
        dir=f"{settings.proj_root}/{docs_folder_path}", ext=docs_extension
    )
    return docs_path
|
||||
def process_documents(docs_path: list[Path]) -> list[dict[str, Chunk | dict[str, Any]]]:
    """Process documents by applying appropriate chefs and chunking strategies based on file type.

    Markdown files are parsed with MarkdownChef and their code/table elements
    fused back into the surrounding text chunks; ``.avap`` sources are chunked
    as plain text with a token chunker. Files with any other extension are
    skipped.

    Args:
        docs_path (list[Path]): List of Paths to the documents to be processed.

    Returns:
        List of dicts with "chunk" (Chunk object) and "extra_metadata"
        (dict with file info).
    """
    processed_docs: list[dict[str, Chunk | dict[str, Any]]] = []
    custom_tokenizer = AutoTokenizer.from_pretrained(settings.hf_emb_model_name)
    chef_md = MarkdownChef(tokenizer=custom_tokenizer)
    chef_txt = TextChef()
    chunker = TokenChunker(tokenizer=custom_tokenizer)

    for doc_path in docs_path:
        doc_extension = doc_path.suffix.lower()
        filename = doc_path.name

        if doc_extension == ".md":
            processed_doc = chef_md.process(doc_path)
            fused_doc = _merge_markdown_document(processed_doc)
            chunked_doc = fused_doc.chunks
        elif doc_extension == ".avap":
            processed_doc = chef_txt.process(doc_path)
            chunked_doc = chunker.chunk(processed_doc.content)
        else:
            # Unsupported extension: skip the file instead of failing the run.
            continue

        for chunk in chunked_doc:
            processed_docs.append({
                "chunk": chunk,
                "extra_metadata": {"file": filename},
            })

    return processed_docs
|
||||
|
||||
|
||||
def ingest_documents(
    chunked_docs: list[dict[str, Chunk | dict[str, Any]]],
    es_index: str,
    es_request_timeout: int,
    es_max_retries: int,
    es_retry_on_timeout: bool,
    delete_es_index: bool,
) -> list[dict[str, Any]]:
    """Ingest processed documents into an Elasticsearch index.

    Args:
        chunked_docs: List of dicts with "chunk" and "extra_metadata" keys.
        es_index (str): Name of the Elasticsearch index to ingest into.
        es_request_timeout (int): Timeout for Elasticsearch requests in seconds.
        es_max_retries (int): Maximum number of retries for Elasticsearch requests.
        es_retry_on_timeout (bool): Whether to retry on Elasticsearch request timeouts.
        delete_es_index (bool): Whether to delete the existing Elasticsearch
            index before ingestion.

    Returns:
        List of dicts with the Elasticsearch bulk action for each chunk.
    """
    logger.info(
        f"Instantiating Elasticsearch client with URL: {settings.elasticsearch_local_url}..."
    )
    es = Elasticsearch(
        hosts=settings.elasticsearch_local_url,
        request_timeout=es_request_timeout,
        max_retries=es_max_retries,
        retry_on_timeout=es_retry_on_timeout,
    )

    # Optionally start from a clean index.
    if delete_es_index and es.indices.exists(index=es_index):
        logger.info(f"Deleting existing Elasticsearch index: {es_index}...")
        es.indices.delete(index=es_index)

    handshake = ElasticHandshakeWithMetadata(
        client=es,
        index_name=es_index,
        embedding_model=OllamaEmbeddings(model=settings.ollama_emb_model_name),
    )

    logger.info(
        f"Ingesting {len(chunked_docs)} chunks into Elasticsearch index: {es_index}..."
    )
    elasticsearch_chunks = handshake.write(chunked_docs)

    return elasticsearch_chunks
|
||||
|
||||
|
||||
def export_documents(elasticsearch_chunks: list[dict[str, Any]], output_path: str) -> None:
    """Export processed documents to a JSON file.

    Args:
        elasticsearch_chunks (list[dict[str, Any]]): List of dicts with the
            Elasticsearch bulk action for each chunk.
        output_path (str): Path, relative to the project root, of the file
            where the JSON will be saved.

    Returns:
        None
    """
    # NOTE(review): this reads `settings.proj_root`, while config.py also
    # defines a `project_root` property — confirm the attribute name.
    target_path = settings.proj_root / output_path

    for chunk in elasticsearch_chunks:
        # NOTE(review): assumes each embedding is a numpy array; .tolist()
        # makes it JSON-serializable but would fail on a plain list — verify
        # against ElasticHandshakeWithMetadata._create_bulk_actions.
        chunk["_source"]["embedding"] = chunk["_source"]["embedding"].tolist()

    with target_path.open("w", encoding="utf-8") as f:
        json.dump(elasticsearch_chunks, f, ensure_ascii=False, indent=4)

    logger.info(f"Exported processed documents to {target_path}")
|
||||
|
|
|
|||
|
|
@ -0,0 +1,125 @@
|
|||
import requests
|
||||
from typing import Any, Callable
|
||||
|
||||
import numpy as np
|
||||
from chonkie.embeddings import BaseEmbeddings
|
||||
|
||||
from src.config import settings
|
||||
|
||||
|
||||
class OllamaEmbeddings(BaseEmbeddings):
    """Chonkie embeddings adapter for a local Ollama embedding model.

    Talks to the Ollama ``/api/embed`` endpoint and exposes the embedding and
    token-counting surface Chonkie expects from a ``BaseEmbeddings``.
    """

    def __init__(
        self,
        model: str,
        base_url: str = settings.ollama_local_url,
        timeout: float = 60.0,
        truncate: bool = True,
        keep_alive: str = "5m",
    ) -> None:
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.truncate = truncate
        self.keep_alive = keep_alive
        # Embedding dimensionality, discovered lazily from the first response.
        self._dimension: int | None = None

    @property
    def dimension(self) -> int:
        """Dimensionality of the embedding vectors (probed lazily)."""
        if self._dimension is None:
            # Lazy-load the dimension from a real embedding response.
            self._dimension = int(self.embed(" ").shape[0])
        return self._dimension

    def embed(self, text: str) -> np.ndarray:
        """Embed a single string and return a float32 vector."""
        rows = self._embed_api(text)
        vector = np.asarray(rows[0], dtype=np.float32)
        if self._dimension is None:
            self._dimension = int(vector.shape[0])
        return vector

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several strings in a single API round trip."""
        if not texts:
            return []
        vectors = [np.asarray(row, dtype=np.float32) for row in self._embed_api(texts)]
        if vectors and self._dimension is None:
            self._dimension = int(vectors[0].shape[0])
        return vectors

    def count_tokens(self, text: str) -> int:
        """Count tokens by letting the server evaluate the prompt."""
        data = self._post_embed(self._build_payload(text))
        return int(data["prompt_eval_count"])

    def count_tokens_batch(self, texts: list[str]) -> list[int]:
        # Ollama reports a single prompt_eval_count for the whole request,
        # not one per input item, so each text needs its own round trip.
        return [self.count_tokens(text) for text in texts]

    def get_tokenizer(self) -> Callable[[str], int]:
        # Chonkie mainly needs a token-counting callable here.
        return self.count_tokens

    @classmethod
    def is_available(cls) -> bool:
        """Return True when the local Ollama server answers /api/tags."""
        try:
            probe = requests.get(
                f"{settings.ollama_local_url}/api/tags",
                timeout=5.0,
            )
            probe.raise_for_status()
        except requests.RequestException:
            return False
        return True

    def __repr__(self) -> str:
        return (
            f"OllamaEmbeddings("
            f"model={self.model!r}, "
            f"base_url={self.base_url!r}, "
            f"dimension={self._dimension!r}"
            f")"
        )

    def _build_payload(self, text_or_texts: str | list[str]) -> dict[str, Any]:
        """Build the JSON body for an /api/embed request."""
        return {
            "model": self.model,
            "input": text_or_texts,
            "truncate": self.truncate,
            "keep_alive": self.keep_alive,
        }

    def _post_embed(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST the payload to /api/embed and return the decoded JSON."""
        try:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as exc:
            raise RuntimeError(
                f"Failed to call Ollama embeddings endpoint at "
                f"{self.base_url}/api/embed"
            ) from exc

        if "embeddings" not in data:
            raise RuntimeError(
                "Ollama response did not include 'embeddings'. "
                f"Response keys: {list(data.keys())}"
            )

        return data

    def _embed_api(self, text_or_texts: str | list[str]) -> list[list[float]]:
        """Low-level embed call returning the raw rows of floats."""
        return self._post_embed(self._build_payload(text_or_texts))["embeddings"]
|
||||
144
src/config.py
144
src/config.py
|
|
@ -1,39 +1,29 @@
|
|||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
from pydantic import Field
|
||||
from dotenv import load_dotenv
|
||||
from datetime import timedelta
|
||||
import warnings
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
class Settings(BaseSettings):
|
||||
raw_path_: str
|
||||
data_path_: str
|
||||
processed_path_: str
|
||||
models_path_: str
|
||||
external_path_: str
|
||||
kubeconfig_path: str
|
||||
interim_path_: str
|
||||
database_url: str
|
||||
openai_api_key: str
|
||||
elasticsearch_index: str
|
||||
elasticsearch_docs_index: str
|
||||
elasticsearch_code_index: str
|
||||
llm_base_url: str
|
||||
ollama_url: str
|
||||
ollama_local_url: str
|
||||
langfuse_host: str
|
||||
data_path_: Optional[str] = None
|
||||
raw_path_: Optional[str] = None
|
||||
processed_path_: Optional[str] = None
|
||||
models_path_: Optional[str] = None
|
||||
external_path_: Optional[str] = None
|
||||
interim_path_: Optional[str] = None
|
||||
kubeconfig_path_: Optional[str] = None
|
||||
postgres_url: str
|
||||
elasticsearch_url: str
|
||||
elasticsearch_local_url: str
|
||||
ollama_url: str
|
||||
ollama_local_url: str
|
||||
ollama_model_name: str
|
||||
ollama_emb_model_name: str
|
||||
model_name: str
|
||||
hf_emb_model_name: str
|
||||
langfuse_host: str
|
||||
langfuse_public_key: str
|
||||
langfuse_secret_key: str
|
||||
hf_token: str
|
||||
hf_emb_model_name: str
|
||||
|
||||
model_config = SettingsConfigDict(
|
||||
env_file=".env",
|
||||
|
|
@ -43,108 +33,40 @@ class Settings(BaseSettings):
|
|||
)
|
||||
|
||||
@property
|
||||
def data_path(self) -> Path:
|
||||
return Path(self.data_path_)
|
||||
|
||||
@property
|
||||
def models_path(self) -> Path:
|
||||
return Path(self.models_path_)
|
||||
|
||||
@property
|
||||
def processed_path(self) -> Path:
|
||||
return Path(self.processed_path_)
|
||||
|
||||
@property
|
||||
def raw_path(self) -> Path:
|
||||
return Path(self.raw_path_)
|
||||
|
||||
@property
|
||||
def interim_path(self) -> Path:
|
||||
return Path(self.interim_path_)
|
||||
|
||||
@property
|
||||
def external_path(self) -> Path:
|
||||
return Path(self.external_path_)
|
||||
|
||||
@property
|
||||
def proj_root(self) -> Path:
|
||||
def project_root(self) -> Path:
|
||||
return Path(__file__).resolve().parents[1]
|
||||
|
||||
@property
|
||||
def database_url(self) -> str:
|
||||
return self.database_url
|
||||
def _resolve_path(self, path: Optional[str]) -> Optional[Path]:
|
||||
if path is None:
|
||||
return None
|
||||
return self.project_root / path
|
||||
|
||||
@property
|
||||
def openai_api_key(self) -> str:
|
||||
return self.openai_api_key
|
||||
def data_path(self) -> Optional[Path]:
|
||||
return self._resolve_path(self.data_path_)
|
||||
|
||||
@property
|
||||
def elasticsearch_index(self) -> str:
|
||||
return self.elasticsearch_index
|
||||
def raw_path(self) -> Optional[Path]:
|
||||
return self._resolve_path(self.raw_path_)
|
||||
|
||||
@property
|
||||
def elasticsearch_docs_index(self) -> str:
|
||||
return self.elasticsearch_docs_index
|
||||
def processed_path(self) -> Optional[Path]:
|
||||
return self._resolve_path(self.processed_path_)
|
||||
|
||||
@property
|
||||
def elasticsearch_code_index(self) -> str:
|
||||
return self.elasticsearch_code_index
|
||||
def models_path(self) -> Optional[Path]:
|
||||
return self._resolve_path(self.models_path_)
|
||||
|
||||
@property
|
||||
def llm_base_url(self) -> str:
|
||||
return self.llm_base_url
|
||||
def external_path(self) -> Optional[Path]:
|
||||
return self._resolve_path(self.external_path_)
|
||||
|
||||
@property
|
||||
def ollama_url(self) -> str:
|
||||
return self.ollama_url
|
||||
def interim_path(self) -> Optional[Path]:
|
||||
return self._resolve_path(self.interim_path_)
|
||||
|
||||
@property
|
||||
def ollama_local_url(self) -> str:
|
||||
return self.ollama_local_url
|
||||
|
||||
@property
|
||||
def langfuse_host(self) -> str:
|
||||
return self.langfuse_host
|
||||
|
||||
@property
|
||||
def elasticsearch_url(self) -> str:
|
||||
return self.elasticsearch_url
|
||||
|
||||
@property
|
||||
def elasticsearch_local_url(self) -> str:
|
||||
return self.elasticsearch_local_url
|
||||
|
||||
@property
|
||||
def ollama_model_name(self) -> str:
|
||||
return self.ollama_model_name
|
||||
|
||||
@property
|
||||
def ollama_emb_model_name(self) -> str:
|
||||
return self.ollama_emb_model_name
|
||||
|
||||
@property
|
||||
def model_name(self) -> str:
|
||||
return self.model_name
|
||||
|
||||
@property
|
||||
def hf_emb_model_name(self) -> str:
|
||||
return self.hf_emb_model_name
|
||||
|
||||
@property
|
||||
def langfuse_public_key(self) -> str:
|
||||
return self.langfuse_public_key
|
||||
|
||||
@property
|
||||
def langfuse_secret_key(self) -> str:
|
||||
return self.langfuse_secret_key
|
||||
|
||||
@property
|
||||
def hf_token(self) -> str:
|
||||
return self.hf_token
|
||||
|
||||
@property
|
||||
def kubeconfig_path(self) -> Path:
|
||||
return Path(self.kubeconfig_path)
|
||||
|
||||
def kubeconfig_path(self) -> Optional[Path]:
|
||||
return self._resolve_path(self.kubeconfig_path_)
|
||||
|
||||
# Module-level singleton; import `settings` elsewhere instead of
# instantiating Settings again.
settings = Settings()
|
||||
29
uv.lock
29
uv.lock
|
|
@ -250,7 +250,6 @@ name = "assistance-engine"
|
|||
version = "0.1.0"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "chonkie", extra = ["semantic"] },
|
||||
{ name = "grpcio" },
|
||||
{ name = "grpcio-reflection" },
|
||||
{ name = "grpcio-tools" },
|
||||
|
|
@ -273,7 +272,9 @@ dependencies = [
|
|||
dev = [
|
||||
{ name = "beir" },
|
||||
{ name = "boto3" },
|
||||
{ name = "chonkie", extra = ["elastic", "semantic"] },
|
||||
{ name = "evidently" },
|
||||
{ name = "flatbuffers" },
|
||||
{ name = "jupyter" },
|
||||
{ name = "langfuse" },
|
||||
{ name = "litellm" },
|
||||
|
|
@ -288,7 +289,6 @@ dev = [
|
|||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "chonkie", extras = ["semantic"], specifier = ">=1.5.6" },
|
||||
{ name = "grpcio", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-reflection", specifier = ">=1.78.0" },
|
||||
{ name = "grpcio-tools", specifier = ">=1.78.0" },
|
||||
|
|
@ -311,7 +311,9 @@ requires-dist = [
|
|||
dev = [
|
||||
{ name = "beir", specifier = ">=2.2.0" },
|
||||
{ name = "boto3", specifier = ">=1.42.58" },
|
||||
{ name = "chonkie", extras = ["elastic", "semantic"], specifier = ">=1.6.0" },
|
||||
{ name = "evidently", specifier = ">=0.7.20" },
|
||||
{ name = "flatbuffers", specifier = ">=25.12.19" },
|
||||
{ name = "jupyter", specifier = ">=1.1.1" },
|
||||
{ name = "langfuse", specifier = "<3" },
|
||||
{ name = "litellm", specifier = ">=1.82.0" },
|
||||
|
|
@ -595,7 +597,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "chonkie"
|
||||
version = "1.5.6"
|
||||
version = "1.6.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "chonkie-core" },
|
||||
|
|
@ -603,12 +605,15 @@ dependencies = [
|
|||
{ name = "tenacity" },
|
||||
{ name = "tqdm" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a4/16/e51295955f5a627ebb7867dc2e7fa48d4c6dc2a5f3cde3690de84812e929/chonkie-1.5.6.tar.gz", hash = "sha256:282a24c20b88c4c28d8cae893ac78bcbee531a87d28ec86b419897a9eea2ecf3", size = 172066, upload-time = "2026-02-16T21:44:01.336Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e5/72/fdf8f89ff439f4ec357af0866c819512391936e4e61b6f15635a48434b8a/chonkie-1.6.0.tar.gz", hash = "sha256:14120d80610c1f549027fc7aa9a5ff604a729b545836f6cadd65d5ae83596279", size = 187056, upload-time = "2026-03-11T04:55:07.657Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/18/3a/24cf4cb377f4d44126231d55a19b48a645a0f78f891288a8d4300c95160d/chonkie-1.5.6-py3-none-any.whl", hash = "sha256:4c3be39a0f97315eb3c5efe6dc5d7933d3d27a1918b55c39ab211b403bb03df7", size = 210065, upload-time = "2026-02-16T21:43:59.926Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ae/c2/7ea7d3409df220dd0e048b1113b44f47eccab9d517b00b037ab0e34c3c7a/chonkie-1.6.0-py3-none-any.whl", hash = "sha256:aa357e02f5cdacac6f8280c5e8651207c866b4137bcf20904db8670ee0808877", size = 232997, upload-time = "2026-03-11T04:55:05.252Z" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
elastic = [
|
||||
{ name = "elasticsearch" },
|
||||
]
|
||||
semantic = [
|
||||
{ name = "model2vec" },
|
||||
{ name = "tokenizers" },
|
||||
|
|
@ -1061,6 +1066,14 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331, upload-time = "2026-02-19T00:48:18.465Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flatbuffers"
|
||||
version = "25.12.19"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = "sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fqdn"
|
||||
version = "1.5.1"
|
||||
|
|
@ -3112,14 +3125,14 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "1.39.1"
|
||||
version = "1.40.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "protobuf" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
|||
Loading…
Reference in New Issue