{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "925b048c",
   "metadata": {},
   "source": [
    "# Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c3215835",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import plotly.graph_objects as go\n",
    "import plotly.subplots as sp\n",
    "from pathlib import Path\n",
    "import numpy as np\n",
    "import yaml\n",
    "from scipy.stats import entropy as scipy_stats_entropy\n",
    "import yaml\n",
    "from scipy.stats import entropy as scipy_stats_entropy, entropy\n",
    "from src.config import settings\n",
    "\n",
    "base_path = Path(\"/home/pseco/VsCodeProjects/assistance-engine/output\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8024618",
   "metadata": {},
   "source": [
    "# Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7e4b843f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_kl_divergence(dataset_dist, prior_dist, epsilon=1e-10):\n",
    "    \"\"\"\n",
    "    Calculate the KL divergence of a dataset distribution from a prior.\n",
    "\n",
    "    KL(P || Q) = sum(P(x) * log(P(x) / Q(x))), using the natural logarithm,\n",
    "    so the result is expressed in nats. Inputs are non-negative weights and\n",
    "    do not need to be normalised; each one is rescaled to sum to 1 here.\n",
    "\n",
    "    Args:\n",
    "        dataset_dist: Mapping of node type -> weight for the dataset (P).\n",
    "        prior_dist: Mapping of node type -> weight for the prior (Q).\n",
    "        epsilon: Floor applied to every weight before normalisation, so node\n",
    "            types that are missing or have zero weight never produce log(0).\n",
    "\n",
    "    Returns:\n",
    "        The divergence as a float, or NaN when either distribution carries no\n",
    "        probability mass (empty or all-zero), where the divergence is undefined.\n",
    "    \"\"\"\n",
    "    # Align both distributions on the union of node types, in a fixed order.\n",
    "    node_types = sorted(set(dataset_dist) | set(prior_dist))\n",
    "    p_vec = np.array([dataset_dist.get(node, 0.0) for node in node_types], dtype=float)\n",
    "    q_vec = np.array([prior_dist.get(node, 0.0) for node in node_types], dtype=float)\n",
    "\n",
    "    # Without any mass there is nothing to compare; smoothing would otherwise\n",
    "    # silently turn an empty distribution into a uniform one.\n",
    "    if p_vec.sum() <= 0 or q_vec.sum() <= 0:\n",
    "        return float(\"nan\")\n",
    "\n",
    "    # Smooth once, then normalise: missing keys and explicit zeros are treated\n",
    "    # alike, and both vectors still sum to 1 when the logarithm is taken.\n",
    "    p_vec = np.maximum(p_vec, epsilon)\n",
    "    q_vec = np.maximum(q_vec, epsilon)\n",
    "    p_vec = p_vec / p_vec.sum()\n",
    "    q_vec = q_vec / q_vec.sum()\n",
    "\n",
    "    return float(np.sum(p_vec * np.log(p_vec / q_vec)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8b63d88",
   "metadata": {},
   "source": [
    "# Read and Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d052f1bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "candidates = {}\n",
    "\n",
    "with 
open(base_path / \"candidate_E_reward_10_coverage_stats.json\") as f:\n",
    "    candidates[\"E\"] = json.load(f)\n",
    "\n",
    "# The remaining stats files follow the same read-and-parse pattern; loading\n",
    "# them in this order keeps the E, F, A insertion order of `candidates`.\n",
    "for candidate_key, stats_file in (\n",
    "    (\"F\", \"candidate_F_reward_10_coverage_stats.json\"),\n",
    "    (\"A\", \"mbpp_avap_v2_reward_stats_A.json\"),\n",
    "):\n",
    "    with open(base_path / stats_file) as f:\n",
    "        candidates[candidate_key] = json.load(f)\n",
    "\n",
    "\n",
    "# construct_map.yaml is resolved against the configured project root.\n",
    "construct_map_path = settings.proj_root / \"construct_map.yaml\"\n",
    "\n",
    "with open(construct_map_path, 'r') as f:\n",
    "    construct_map = yaml.safe_load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "87c6af5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_for_viz = {}\n",
    "\n",
    "for candidate_name, stats in candidates.items():\n",
    "    node_freq = stats.get(\"node_type_frequency\", {})\n",
    "\n",
    "    # No recorded frequencies -> empty frame with the same two columns.\n",
    "    if not node_freq:\n",
    "        df = pd.DataFrame({\"node_type\": [], \"frequency\": []})\n",
    "    else:\n",
    "        # Most frequent node types first.\n",
    "        node_types = list(node_freq.keys())\n",
    "        frequencies = list(node_freq.values())\n",
    "        df = pd.DataFrame(\n",
    "            {\"node_type\": node_types, \"frequency\": frequencies}\n",
    "        ).sort_values(\"frequency\", ascending=False)\n",
    "\n",
    "    # Per-candidate bundle consumed by the analysis cells below.\n",
    "    data_for_viz[candidate_name] = {\n",
    "        \"dataframe\": df,\n",
    "        \"entropy\": stats.get(\"distribution_entropy\", 0),\n",
    "        \"total_nodes\": len(node_freq),\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "399a5931",
   "metadata": {},
   "source": [
    "# KL Divergence Analysis: Dataset vs ConstructPrior"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "725d9cd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One fixed colour per candidate.\n",
    "colors = {\n",
    "    \"A\": \"#1f77b4\",\n",
    "    \"E\": \"#ff7f0e\",\n",
    "    \"F\": \"#2ca02c\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d98606a2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ConstructPrior Metadata:\n",
      " Description: Auto-generated by construct_prior.py. 
Weights derived from real GitHub codebases via Python AST anal...\n",
      " Generated at: 2026-03-26T11:00:54.338542+00:00\n",
      " Source stats: {'github_files_analyzed': 100, 'github_files_fetched': 100, 'total_pair_cooccurrences': 441, 'total_trio_cooccurrences': 3821}\n",
      "\n",
      " Total AVAP node types in schema: 38\n",
      "\n",
      "ConstructPrior Statistics:\n",
      " Total node types (uniform prior): 38\n",
      " Prior probability per node: 0.026316\n",
      "\n",
      "All AVAP Node Types in Prior:\n",
      " node_type probability\n",
      "AddVariableToJSON 0.026316\n",
      " RequestGet 0.026316\n",
      " RequestPost 0.026316\n",
      " _status 0.026316\n",
      " addParam 0.026316\n",
      " addResult 0.026316\n",
      " addVar 0.026316\n",
      " avapConnector 0.026316\n",
      " else 0.026316\n",
      " encodeMD5 0.026316\n",
      " encodeSHA256 0.026316\n",
      " end 0.026316\n",
      " endLoop 0.026316\n",
      " exception 0.026316\n",
      " function 0.026316\n",
      " gather 0.026316\n",
      " getDateTime 0.026316\n",
      " getListLen 0.026316\n",
      "getQueryParamList 0.026316\n",
      " getTimeStamp 0.026316\n",
      " go 0.026316\n",
      " if_mode1 0.026316\n",
      " if_mode2 0.026316\n",
      " import 0.026316\n",
      " include 0.026316\n",
      " itemFromList 0.026316\n",
      " ormAccessInsert 0.026316\n",
      " ormAccessSelect 0.026316\n",
      " ormAccessUpdate 0.026316\n",
      " ormCheckTable 0.026316\n",
      " ormDirect 0.026316\n",
      " randomString 0.026316\n",
      " replace 0.026316\n",
      " return 0.026316\n",
      " stampToDatetime 0.026316\n",
      " startLoop 0.026316\n",
      " try 0.026316\n",
      " variableFromJSON 0.026316\n"
     ]
    }
   ],
   "source": [
    "prior_info = construct_map.get('meta', {})\n",
    "print(\"ConstructPrior Metadata:\")\n",
    "print(f\" Description: {prior_info.get('description', 'N/A')[:100]}...\")\n",
    "print(f\" Generated at: {prior_info.get('generated_at', 'N/A')}\")\n",
    "print(f\" Source stats: {prior_info.get('source_stats', {})}\")\n",
    "\n",
    "avap_node_names = prior_info.get('avap_node_names', [])\n",
    "print(f\"\\n Total AVAP node types in schema: {len(avap_node_names)}\")\n",
    "\n",
    "# A uniform prior over zero node types is undefined; fail with a clear message\n",
    "# instead of the ZeroDivisionError the probability computation would raise.\n",
    "if not avap_node_names:\n",
    "    raise ValueError(\"construct_map meta.avap_node_names is missing or empty; cannot build a uniform prior\")\n",
    "\n",
    "# Uniform prior: every AVAP node type listed in the schema gets equal weight.\n",
    "prior_distribution = {node: 1.0 for node in avap_node_names}\n",
    "prior_total = len(avap_node_names)\n",
    "uniform_probability = 1.0 / prior_total\n",
    "\n",
    "prior_probabilities = {k: uniform_probability for k in avap_node_names}\n",
    "\n",
    "print(f\"\\nConstructPrior Statistics:\")\n",
    "print(f\" Total node types (uniform prior): {len(prior_distribution)}\")\n",
    "print(f\" Prior probability per node: {uniform_probability:.6f}\")\n",
    "print(f\"\\nAll AVAP Node Types in Prior:\")\n",
    "prior_df = pd.DataFrame({\n",
    "    \"node_type\": avap_node_names,\n",
    "    \"probability\": [uniform_probability] * prior_total\n",
    "}).sort_values(\"node_type\")\n",
    "\n",
    "print(prior_df.to_string(index=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b6aeb3cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Candidate A:\n",
      " KL Divergence (Dataset || Prior): 0.200022\n",
      " Dataset entropy: 4.9590\n",
      "\n",
      "Candidate E:\n",
      " KL Divergence (Dataset || Prior): 0.670759\n",
      " Dataset entropy: 4.2800\n",
      "\n",
      "Candidate F:\n",
      " KL Divergence (Dataset || Prior): 0.823227\n",
      " Dataset entropy: 4.0600\n",
      "\n"
     ]
    }
   ],
   "source": [
    "kl_divergences = {}\n",
    "\n",
    "for candidate_name in [\"A\", \"E\", \"F\"]:\n",
    "    viz_data = data_for_viz[candidate_name]\n",
    "    df = viz_data[\"dataframe\"]\n",
    "\n",
    "    if len(df) > 0:\n",
    "        # Turn raw counts into probabilities with one column-wise division\n",
    "        # rather than iterating over the frame row by row.\n",
    "        dataset_total = df[\"frequency\"].sum()\n",
    "        dataset_dist = dict(zip(df[\"node_type\"], df[\"frequency\"] / dataset_total))\n",
    "    else:\n",
    "        dataset_dist = {}\n",
    "\n",
    "    kl_div = calculate_kl_divergence(dataset_dist, prior_probabilities)\n",
    "    kl_divergences[candidate_name] = kl_div\n",
    "\n",
    "    print(f\"Candidate {candidate_name}:\")\n",
    "    print(f\" KL Divergence (Dataset || Prior): {kl_div:.6f}\")\n",
    "    print(f\" Dataset entropy: {viz_data['entropy']:.4f}\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "62bde9ab",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"================================================================================\n",
      "COMPREHENSIVE COMPARISON TABLE: Datasets vs ConstructPrior\n",
      "================================================================================\n",
      "Candidate KL Divergence Dataset Entropy Prior Entropy Node Types Total Frequency\n",
      " A 0.200022 4.959 3.637586 34 250\n",
      " E 0.670759 4.280 3.637586 21 29\n",
      " F 0.823227 4.060 3.637586 18 27\n",
      "================================================================================\n",
      "\n",
      "ConstructPrior Entropy: 3.6376\n",
      "ConstructPrior Node Types: 38\n",
      "ConstructPrior Total Frequency: 38\n"
     ]
    }
   ],
   "source": [
    "comparison_table = {\n",
    "    \"Candidate\": [],\n",
    "    \"KL Divergence\": [],\n",
    "    \"Dataset Entropy\": [],\n",
    "    \"Prior Entropy\": [],\n",
    "    \"Node Types\": [],\n",
    "    \"Total Frequency\": []\n",
    "}\n",
    "\n",
    "# The dataset entropies come straight from the stats files. Values such as\n",
    "# 4.959 for 34 node types exceed ln(34) (about 3.53), so they cannot be in\n",
    "# nats; they are consistent with bits. scipy defaults to nats, so request\n",
    "# base 2 to keep the two entropy columns in the same unit.\n",
    "# NOTE(review): confirm the base used by the stats producer.\n",
    "prior_entropy = scipy_stats_entropy(list(prior_probabilities.values()), base=2)\n",
    "\n",
    "# One table row per candidate, in a fixed A, E, F order.\n",
    "for candidate_name in [\"A\", \"E\", \"F\"]:\n",
    "    viz_data = data_for_viz[candidate_name]\n",
    "    df = viz_data[\"dataframe\"]\n",
    "\n",
    "    comparison_table[\"Candidate\"].append(candidate_name)\n",
    "    comparison_table[\"KL Divergence\"].append(kl_divergences[candidate_name])\n",
    "    comparison_table[\"Dataset Entropy\"].append(viz_data[\"entropy\"])\n",
    "    comparison_table[\"Prior Entropy\"].append(prior_entropy)\n",
    "    comparison_table[\"Node Types\"].append(len(df))\n",
    "    comparison_table[\"Total Frequency\"].append(df[\"frequency\"].sum() if len(df) > 0 else 0)\n",
    "\n",
    "comp_table_df = pd.DataFrame(comparison_table)\n",
    "\n",
    "print(\"=\"*80)\n",
    "print(\"COMPREHENSIVE COMPARISON TABLE: Datasets vs ConstructPrior\")\n",
    "print(\"=\"*80)\n",
    "print(comp_table_df.to_string(index=False))\n",
    "print(\"=\"*80)\n",
    "print(f\"\\nConstructPrior Entropy: {prior_entropy:.4f}\")\n",
    "print(f\"ConstructPrior Node Types: {len(prior_probabilities)}\")\n",
    "print(f\"ConstructPrior Total Frequency: {prior_total}\")"
   ]
  },
  {
   "cell_type": "code",
"execution_count": 9, "id": "2eb4246b", "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "marker": { "color": [ "#1f77b4", "#ff7f0e", "#2ca02c" ] }, "name": "KL Divergence", "text": [ 0.200022, 0.670759, 0.823227 ], "textposition": "auto", "type": "bar", "x": [ "A", "E", "F" ], "xaxis": "x", "y": [ 0.2000219752539122, 0.6707593635118403, 0.8232269319388326 ], "yaxis": "y" }, { "marker": { "color": "#1f77b4" }, "name": "Dataset Entropy", "opacity": 0.7, "type": "bar", "x": [ "A", "E", "F" ], "xaxis": "x2", "y": [ 4.959, 4.28, 4.06 ], "yaxis": "y2" }, { "marker": { "color": "#ff7f0e" }, "name": "Prior Entropy", "opacity": 0.7, "type": "bar", "x": [ "A", "E", "F" ], "xaxis": "x2", "y": [ 3.6375861597263865, 3.6375861597263865, 3.6375861597263865 ], "yaxis": "y2" }, { "marker": { "color": [ "#1f77b4", "#ff7f0e", "#2ca02c" ], "line": { "color": "white", "width": 2 }, "size": 12 }, "mode": "markers+text", "name": "Candidates", "text": [ "A", "E", "F" ], "textposition": "top center", "type": "scatter", "x": [ 4.959, 4.28, 4.06 ], "xaxis": "x3", "y": [ 0.2000219752539122, 0.6707593635118403, 0.8232269319388326 ], "yaxis": "y3" }, { "marker": { "color": [ "#1f77b4", "#ff7f0e", "#2ca02c" ] }, "name": "Node Types", "text": [ 34, 21, 18 ], "textposition": "auto", "type": "bar", "x": [ "A", "E", "F" ], "xaxis": "x4", "y": [ 34, 21, 18 ], "yaxis": "y4" } ], "layout": { "annotations": [ { "font": { "size": 16 }, "showarrow": false, "text": "KL Divergence vs ConstructPrior", "x": 0.225, "xanchor": "center", "xref": "paper", "y": 1, "yanchor": "bottom", "yref": "paper" }, { "font": { "size": 16 }, "showarrow": false, "text": "Dataset vs Prior Entropy", "x": 0.775, "xanchor": "center", "xref": "paper", "y": 1, "yanchor": "bottom", "yref": "paper" }, { "font": { "size": 16 }, "showarrow": false, "text": 
"KL Divergence vs Dataset Entropy", "x": 0.225, "xanchor": "center", "xref": "paper", "y": 0.375, "yanchor": "bottom", "yref": "paper" }, { "font": { "size": 16 }, "showarrow": false, "text": "Node Type Coverage", "x": 0.775, "xanchor": "center", "xref": "paper", "y": 0.375, "yanchor": "bottom", "yref": "paper" }, { "showarrow": false, "text": "Prior: 38 types", "x": 1, "xanchor": "right", "xref": "x4 domain", "y": 38, "yanchor": "bottom", "yref": "y4" } ], "height": 800, "shapes": [ { "line": { "color": "red", "dash": "dash" }, "type": "line", "x0": 0, "x1": 1, "xref": "x4 domain", "y0": 38, "y1": 38, "yref": "y4" } ], "showlegend": true, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], 
"heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, 
"type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, 
"#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, 
"shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Comprehensive KL Divergence Analysis: Datasets vs ConstructPrior" }, "xaxis": { "anchor": "y", "domain": [ 0, 0.45 ] }, "xaxis2": { "anchor": "y2", "domain": [ 0.55, 1 ] }, "xaxis3": { "anchor": "y3", "domain": [ 0, 0.45 ], "title": { "text": "Dataset Entropy" } }, "xaxis4": { "anchor": "y4", "domain": [ 0.55, 1 ] }, "yaxis": { "anchor": "x", "domain": [ 0.625, 1 ], "title": { "text": "KL Divergence" } }, "yaxis2": { "anchor": "x2", "domain": [ 0.625, 1 ], "title": { "text": "Entropy" } }, "yaxis3": { "anchor": "x3", "domain": [ 0, 0.375 ], "title": { "text": "KL Divergence" } }, "yaxis4": { "anchor": "x4", "domain": [ 0, 0.375 ], "title": { "text": "# Node Types" } } } }, "text/html": [ "
" ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 2x2 dashboard: KL per candidate, dataset vs prior entropy, KL against\n",
    "# entropy, and node-type coverage against the size of the prior.\n",
    "fig = sp.make_subplots(\n",
    "    rows=2, cols=2,\n",
    "    subplot_titles=(\n",
    "        \"KL Divergence vs ConstructPrior\",\n",
    "        \"Dataset vs Prior Entropy\",\n",
    "        \"KL Divergence vs Dataset Entropy\",\n",
    "        \"Node Type Coverage\"\n",
    "    ),\n",
    "    specs=[\n",
    "        [{\"type\": \"bar\"}, {\"type\": \"bar\"}],\n",
    "        [{\"type\": \"scatter\"}, {\"type\": \"bar\"}]\n",
    "    ]\n",
    ")\n",
    "\n",
    "# One colour per candidate, looked up in the shared `colors` palette so that\n",
    "# every panel uses the same mapping instead of a repeated literal list.\n",
    "candidate_colors = [colors[name] for name in comp_table_df[\"Candidate\"]]\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Bar(\n",
    "        x=comp_table_df[\"Candidate\"],\n",
    "        y=comp_table_df[\"KL Divergence\"],\n",
    "        marker_color=candidate_colors,\n",
    "        name=\"KL Divergence\",\n",
    "        text=comp_table_df[\"KL Divergence\"].round(6),\n",
    "        textposition=\"auto\"\n",
    "    ),\n",
    "    row=1, col=1\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Bar(\n",
    "        x=comp_table_df[\"Candidate\"],\n",
    "        y=comp_table_df[\"Dataset Entropy\"],\n",
    "        name=\"Dataset Entropy\",\n",
    "        marker_color=\"#1f77b4\",\n",
    "        opacity=0.7\n",
    "    ),\n",
    "    row=1, col=2\n",
    ")\n",
    "\n",
    "# The prior entropy is a single value, repeated so it lines up with each candidate.\n",
    "fig.add_trace(\n",
    "    go.Bar(\n",
    "        x=comp_table_df[\"Candidate\"],\n",
    "        y=[prior_entropy] * len(comp_table_df),\n",
    "        name=\"Prior Entropy\",\n",
    "        marker_color=\"#ff7f0e\",\n",
    "        opacity=0.7\n",
    "    ),\n",
    "    row=1, col=2\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Scatter(\n",
    "        x=comp_table_df[\"Dataset Entropy\"],\n",
    "        y=comp_table_df[\"KL Divergence\"],\n",
    "        mode='markers+text',\n",
    "        marker=dict(\n",
    "            size=12,\n",
    "            color=candidate_colors,\n",
    "            line=dict(width=2, color=\"white\")\n",
    "        ),\n",
    "        text=comp_table_df[\"Candidate\"],\n",
    "        textposition=\"top center\",\n",
    "        name=\"Candidates\"\n",
    "    ),\n",
    "    row=2, col=1\n",
    ")\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Bar(\n",
    "        x=comp_table_df[\"Candidate\"],\n",
    "        y=comp_table_df[\"Node Types\"],\n",
    "        marker_color=candidate_colors,\n",
    "        text=comp_table_df[\"Node Types\"],\n",
    "        textposition=\"auto\",\n",
    "        name=\"Node Types\"\n",
    "    ),\n",
    "    row=2, col=2\n",
    ")\n",
    "\n",
    "# Reference line: number of node types covered by the prior.\n",
    "fig.add_hline(\n",
    "    y=len(prior_probabilities),\n",
    "    line_dash=\"dash\",\n",
    "    line_color=\"red\",\n",
    "    annotation_text=f\"Prior: {len(prior_probabilities)} types\",\n",
    "    row=2, col=2\n",
    ")\n",
    "\n",
    "fig.update_yaxes(title_text=\"KL Divergence\", row=1, col=1)\n",
    "fig.update_yaxes(title_text=\"Entropy\", row=1, col=2)\n",
    "fig.update_xaxes(title_text=\"Dataset Entropy\", row=2, col=1)\n",
    "fig.update_yaxes(title_text=\"KL Divergence\", row=2, col=1)\n",
    "fig.update_yaxes(title_text=\"# Node Types\", row=2, col=2)\n",
    "\n",
    "fig.update_layout(\n",
    "    title_text=\"Comprehensive KL Divergence Analysis: Datasets vs ConstructPrior\",\n",
    "    height=800,\n",
    "    showlegend=True\n",
    ")\n",
    "\n",
    "fig.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}