diff --git a/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb b/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb index 20981a1..01de0c6 100644 --- a/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb +++ b/scratches/pseco/ingestion/Code Ingestion/n00 Proper Lark Chunking.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "0a8abbfa", "metadata": {}, "outputs": [ @@ -12,7 +12,7 @@ "True" ] }, - "execution_count": 1, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -24,7 +24,7 @@ "from dataclasses import dataclass\n", "from pathlib import Path\n", "from typing import Any, Dict, List, Optional, Tuple\n", - "from bnf import grammar\n", + "# from bnf import grammar\n", "import nltk\n", "from elasticsearch import Elasticsearch\n", "from langchain_core.documents import Document\n", @@ -33,26 +33,40 @@ "from lark import Lark, Token, Transformer, Tree\n", "from transformers import AutoConfig\n", "\n", - "from src.config import (DATA_DIR, ELASTICSEARCH_CODE_INDEX,\n", - " ELASTICSEARCH_DOCS_INDEX, ELASTICSEARCH_INDEX,\n", - " ELASTICSEARCH_URL, HF_EMB_MODEL_NAME,\n", - " OLLAMA_EMB_MODEL_NAME, OLLAMA_LOCAL_URL,\n", - " OLLAMA_MODEL_NAME, OLLAMA_URL, PROJ_ROOT)\n", + "from src.config import settings\n", "\n", "nltk.download(\"punkt\", quiet=True)" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "id": "5c9d292b", "metadata": {}, "outputs": [], "source": [ - "config = AutoConfig.from_pretrained(HF_EMB_MODEL_NAME)\n", + "config = AutoConfig.from_pretrained(settings.hf_emb_model_name)\n", "embedding_dim = config.hidden_size" ] }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d2009c2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "qwen3.5:2b\n" + ] + } + ], + "source": [ + "print(settings.ollama_model_name)" + ] + }, { "cell_type": 
"markdown", "id": "baa779f3",
diff --git a/src/config.py b/src/config.py
index fdb41d3..3bdffc6 100644
--- a/src/config.py
+++ b/src/config.py
@@ -1,39 +1,81 @@
 from pathlib import Path
-from dotenv import load_dotenv
-import os
-
-load_dotenv()
-
-OPENAI_API_KEY=os.getenv("OPENAI_API_KEY", "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")
-
-OLLAMA_URL=os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
-OLLAMA_LOCAL_URL=os.getenv("OLLAMA_LOCAL_URL", "http://localhost:11434")
-OLLAMA_MODEL_NAME=os.getenv("OLLAMA_MODEL_NAME", "qwen3-0.6B:latest")
-OLLAMA_EMB_MODEL_NAME=os.getenv("OLLAMA_EMB_MODEL_NAME", "qwen3-0.6B-emb:latest")
-ELASTICSEARCH_DOCS_INDEX = os.getenv("ELASTICSEARCH_DOCS_INDEX")
-ELASTICSEARCH_CODE_INDEX = os.getenv("ELASTICSEARCH_CODE_INDEX")
-
-LANGFUSE_HOST=os.getenv("LANGFUSE_HOST", "http://45.77.119.180")
-LANGFUSE_PUBLIC_KEY=os.getenv("LANGFUSE_PUBLIC_KEY", "pk-lf-0e6db694-3e95-4dd4-aedf-5a2694267058")
-LANGFUSE_SECRET_KEY=os.getenv("LANGFUSE_SECRET_KEY", "sk-lf-dbf28bb9-15bb-4d03-a8c3-05caa3e3905f")
-
-ELASTICSEARCH_URL=os.getenv("ELASTICSEARCH_URL", "http://host.docker.internal:9200")
-ELASTICSEARCH_LOCAL_URL=os.getenv("ELASTICSEARCH_LOCAL_URL", "http://localhost:9200")
-ELASTICSEARCH_INDEX=os.getenv("ELASTICSEARCH_INDEX", "avap-docs-test")
-
-DATABASE_URL=os.getenv("DATABASE_URL", "postgresql://postgres:brunix_pass@host.docker.internal:5432/postgres")
-
-KUBECONFIG_PATH=os.getenv("KUBECONFIG_PATH", "kubernetes/kubeconfig.yaml")
-
-HF_TOKEN=os.getenv("HF_TOKEN", "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")
-HF_EMB_MODEL_NAME=os.getenv("HF_EMB_MODEL_NAME", "Qwen/Qwen3-Embedding-0.6B")
-
-PROJ_ROOT = Path(__file__).resolve().parents[1]
-
-DATA_DIR=PROJ_ROOT / "data"
-MODELS_DIR=DATA_DIR / "models"
-RAW_DIR=DATA_DIR / "raw"
-PROCESSED_DIR=DATA_DIR / "processed"
-INTERIM_DIR=DATA_DIR / "interim"
-EXTERNAL_DIR=DATA_DIR / "external"
-DOCS_DIR=PROJ_ROOT / "docs"
\ No newline at end of file
+
+from dotenv import load_dotenv
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+load_dotenv()
+
+
+class Settings(BaseSettings):
+    """Typed project settings, loaded from the environment and the .env file."""
+
+    # Directory locations. Trailing-underscore fields hold the raw string
+    # values; the properties below expose them as ``Path`` objects.
+    raw_path_: str
+    data_path_: str
+    processed_path_: str
+    models_path_: str
+    external_path_: str
+    interim_path_: str
+    kubeconfig_path: str
+
+    # Service endpoints, model names and credentials. The plain fields are
+    # the accessors; do NOT add same-named @property wrappers on top of
+    # them — a property shadows the pydantic field and `return self.x`
+    # inside `def x(self)` recurses infinitely.
+    database_url: str
+    openai_api_key: str
+    elasticsearch_index: str
+    elasticsearch_docs_index: str
+    elasticsearch_code_index: str
+    llm_base_url: str
+    ollama_url: str
+    ollama_local_url: str
+    langfuse_host: str
+    elasticsearch_url: str
+    elasticsearch_local_url: str
+    ollama_model_name: str
+    ollama_emb_model_name: str
+    model_name: str
+    hf_emb_model_name: str
+    langfuse_public_key: str
+    langfuse_secret_key: str
+    hf_token: str
+
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore",
+        # "model_name"/"models_path_" would otherwise trip pydantic's
+        # protected-namespace ("model_") warning.
+        protected_namespaces=(),
+    )
+
+    @property
+    def raw_path(self) -> Path:
+        return Path(self.raw_path_)
+
+    @property
+    def data_path(self) -> Path:
+        return Path(self.data_path_)
+
+    @property
+    def processed_path(self) -> Path:
+        return Path(self.processed_path_)
+
+    @property
+    def models_path(self) -> Path:
+        return Path(self.models_path_)
+
+    @property
+    def interim_path(self) -> Path:
+        return Path(self.interim_path_)
+
+    @property
+    def external_path(self) -> Path:
+        return Path(self.external_path_)
+
+    @property
+    def proj_root(self) -> Path:
+        # Repository root: src/config.py -> src -> project root.
+        return Path(__file__).resolve().parents[1]
+
+
+settings = Settings()