Refactor: Move backend files to backend/ directory and split .gitignore

This commit is contained in:
Yunxiao Xu
2026-02-11 17:40:44 -08:00
parent 48924affa0
commit 7a69133e26
96 changed files with 144 additions and 176 deletions

View File

@@ -0,0 +1,45 @@
from langchain_core.messages import AIMessage
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
def clarification_node(state: AgentState) -> dict:
    """Ask the user for missing information or clarifications.

    Reads the prior query analysis from state and asks the LLM to turn the
    detected ambiguities into a polite follow-up question for the user.

    Args:
        state: Graph state; reads "question" and "analysis" (its
            "ambiguities" list, if any).

    Returns:
        dict updating "messages" with the clarification response and setting
        "next_action" to "end".

    Raises:
        Exception: re-raises any LLM invocation failure after logging it.
    """
    question = state["question"]
    analysis = state.get("analysis", {})
    ambiguities = analysis.get("ambiguities", [])

    settings = Settings()
    logger = get_logger("clarification")
    logger.info(f"Generating clarification for {len(ambiguities)} ambiguities.")

    # Reuses the query-analyzer LLM config; logging callback tracks usage.
    llm = get_llm_model(
        settings.query_analyzer_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    )

    system_prompt = """You are a Clarification Specialist. Your role is to identify what information is missing from a user's request to perform a data analysis or research task.
Based on the analysis of the user's question, formulate a polite and concise request for the missing information."""
    prompt = f"""Original Question: {question}
Missing/Ambiguous Information: {', '.join(ambiguities) if ambiguities else 'Unknown ambiguities'}
Please ask the user for the necessary details."""
    messages = [
        ("system", system_prompt),
        ("user", prompt)
    ]

    try:
        response = llm.invoke(messages)
        logger.info("[bold green]Clarification generated.[/bold green]")
        return {
            "messages": [response],
            "next_action": "end"  # To indicate we are done for now
        }
    except Exception as e:
        logger.error(f"Failed to generate clarification: {str(e)}")
        # Bare `raise` re-raises with the original traceback intact,
        # without the redundant frame `raise e` would add.
        raise

View File

@@ -0,0 +1,47 @@
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils import helpers, database_inspection
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
from ea_chatbot.graph.prompts.coder import CODE_GENERATOR_PROMPT
from ea_chatbot.schemas import CodeGenerationResponse
def coder_node(state: AgentState) -> dict:
    """Generate Python code based on the plan and data summary.

    Args:
        state: Graph state; reads "question", "plan", and "code_output"
            (prior execution results, fed back to the prompt).

    Returns:
        dict with the newly generated "code" and "error" cleared to None so
        the executor gets a fresh attempt.

    Raises:
        Exception: re-raises any LLM invocation failure after logging it.
    """
    question = state["question"]
    plan = state.get("plan", "")
    code_output = state.get("code_output", "None")

    settings = Settings()
    logger = get_logger("coder")
    logger.info("Generating Python code...")

    llm = get_llm_model(
        settings.coder_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    )
    # Constrain output to the CodeGenerationResponse schema.
    structured_llm = llm.with_structured_output(CodeGenerationResponse)

    # Always provide data summary; the LLM decides what is relevant.
    database_description = database_inspection.get_data_summary(data_dir=settings.data_dir) or "No data available."
    example_code = ""  # Placeholder

    messages = CODE_GENERATOR_PROMPT.format_messages(
        question=question,
        plan=plan,
        database_description=database_description,
        code_exec_results=code_output,
        example_code=example_code
    )

    try:
        response = structured_llm.invoke(messages)
        logger.info("[bold green]Code generated.[/bold green]")
        return {
            "code": response.parsed_code,
            "error": None  # Clear previous errors on new code generation
        }
    except Exception as e:
        logger.error(f"Failed to generate code: {str(e)}")
        # Bare `raise` preserves the original traceback (no extra frame).
        raise

View File

@@ -0,0 +1,44 @@
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
from ea_chatbot.graph.prompts.coder import ERROR_CORRECTOR_PROMPT
from ea_chatbot.schemas import CodeGenerationResponse
def error_corrector_node(state: AgentState) -> dict:
    """Fix the code based on the execution error.

    Args:
        state: Graph state; reads "code", "error", and "iterations".

    Returns:
        dict with the corrected "code", "error" cleared to None, and
        "iterations" incremented by one.

    Raises:
        Exception: re-raises any LLM invocation failure after logging it.
    """
    code = state.get("code", "")
    # `state.get("error", default)` returns None when the key exists with
    # value None (executor sets error=None on success), and slicing None
    # below would raise TypeError — use `or` so both cases get the fallback.
    error = state.get("error") or "Unknown error"

    settings = Settings()
    logger = get_logger("error_corrector")
    logger.warning(f"[bold red]Execution error detected:[/bold red] {error[:100]}...")
    logger.info("Attempting to correct the code...")

    # Reuse coder LLM config or add a new one. Using coder_llm for now.
    llm = get_llm_model(
        settings.coder_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    )
    structured_llm = llm.with_structured_output(CodeGenerationResponse)

    messages = ERROR_CORRECTOR_PROMPT.format_messages(
        code=code,
        error=error
    )

    try:
        response = structured_llm.invoke(messages)
        logger.info("[bold green]Correction generated.[/bold green]")
        current_iterations = state.get("iterations", 0)
        return {
            "code": response.parsed_code,
            "error": None,  # Clear error after fix attempt
            "iterations": current_iterations + 1
        }
    except Exception as e:
        logger.error(f"Failed to correct code: {str(e)}")
        # Bare `raise` preserves the original traceback (no extra frame).
        raise

View File

@@ -0,0 +1,102 @@
import io
import sys
import traceback
from contextlib import redirect_stdout
from typing import Any, Dict, List, TYPE_CHECKING
import pandas as pd
from matplotlib.figure import Figure
from ea_chatbot.graph.state import AgentState
from ea_chatbot.utils.db_client import DBClient
from ea_chatbot.utils.logging import get_logger
from ea_chatbot.config import Settings
if TYPE_CHECKING:
from ea_chatbot.types import DBSettings
def executor_node(state: AgentState) -> dict:
    """Execute the Python code and capture output, plots, and dataframes.

    Args:
        state: Graph state; reads "code" (the generated Python source).

    Returns:
        dict with "code_output" (captured, truncated stdout), "error"
        (filtered traceback string or None), "plots" (matplotlib Figures
        collected from the `plots` list), and "dfs" (result DataFrames).
    """
    code = state.get("code")
    logger = get_logger("executor")

    if not code:
        logger.error("No code provided to executor.")
        return {"error": "No code provided to executor."}

    logger.info("Executing Python code...")

    settings = Settings()
    db_settings: "DBSettings" = {
        "host": settings.db_host,
        "port": settings.db_port,
        "user": settings.db_user,
        "pswd": settings.db_pswd,
        "db": settings.db_name,
        "table": settings.db_table
    }
    db_client = DBClient(settings=db_settings)

    # Single namespace used as BOTH globals and locals for exec().
    # With separate dicts — exec(code, {}, local_vars) — any function
    # *defined inside* the generated code cannot see 'db', 'pd', or 'plots',
    # because function bodies resolve free names via globals, not the
    # enclosing exec locals (a classic exec() pitfall).
    exec_env: Dict[str, Any] = {
        'db': db_client,
        'plots': [],
        'pd': pd
    }

    stdout_buffer = io.StringIO()
    error = None
    code_output = ""
    plots: List[Figure] = []
    dfs: Dict[str, pd.DataFrame] = {}

    try:
        # SECURITY NOTE: exec() runs LLM-generated code with full interpreter
        # privileges; only acceptable in a trusted or sandboxed deployment.
        with redirect_stdout(stdout_buffer):
            exec(code, exec_env)
        code_output = stdout_buffer.getvalue()
        # Limit the output length if it's too long
        if code_output.count('\n') > 32:
            code_output = '\n'.join(code_output.split('\n')[:32]) + '\n...'
        # Extract plots
        raw_plots = exec_env.get('plots', [])
        if isinstance(raw_plots, list):
            plots = [p for p in raw_plots if isinstance(p, Figure)]
        # Extract DataFrames that were likely intended for display
        # We look for DataFrames in the exec namespace that were mentioned in the code
        for key, value in exec_env.items():
            if isinstance(value, pd.DataFrame):
                # Heuristic: if the variable name is in the code, it might be a result DF
                if key in code:
                    dfs[key] = value
        logger.info(f"[bold green]Execution complete.[/bold green] Captured {len(plots)} plots and {len(dfs)} dataframes.")
    except Exception as e:
        # Capture the traceback
        exc_type, exc_value, tb = sys.exc_info()
        full_traceback = traceback.format_exc()
        # Filter traceback to show only the relevant part (the executed string)
        filtered_tb_lines = [line for line in full_traceback.split('\n') if '<string>' in line]
        error = '\n'.join(filtered_tb_lines)
        if error:
            error += '\n'
        error += f"{exc_type.__name__ if exc_type else 'Exception'}: {exc_value}"
        logger.error(f"Execution failed: {str(e)}")
        # If we have an error, we still might want to see partial stdout
        code_output = stdout_buffer.getvalue()

    return {
        "code_output": code_output,
        "error": error,
        "plots": plots,
        "dfs": dfs
    }

View File

@@ -0,0 +1,51 @@
import yaml
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils import helpers, database_inspection
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
from ea_chatbot.graph.prompts.planner import PLANNER_PROMPT
from ea_chatbot.schemas import TaskPlanResponse
def planner_node(state: AgentState) -> dict:
    """Generate a structured plan based on the query analysis.

    Args:
        state: Graph state; reads "question", the last 6 "messages", and
            "summary" (conversation summary).

    Returns:
        dict with "plan" set to the YAML-serialized task plan.

    Raises:
        Exception: re-raises any LLM invocation failure after logging it.
    """
    question = state["question"]
    history = state.get("messages", [])[-6:]
    summary = state.get("summary", "")

    settings = Settings()
    logger = get_logger("planner")
    logger.info("Generating task plan...")

    llm = get_llm_model(
        settings.planner_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    )
    # Constrain output to the TaskPlanResponse schema.
    structured_llm = llm.with_structured_output(TaskPlanResponse)

    date_str = helpers.get_readable_date()
    # Always provide data summary; LLM decides relevance.
    database_description = database_inspection.get_data_summary(data_dir=settings.data_dir) or "No data available."
    example_plan = ""

    messages = PLANNER_PROMPT.format_messages(
        date=date_str,
        question=question,
        history=history,
        summary=summary,
        database_description=database_description,
        example_plan=example_plan
    )

    # Generate the structured plan
    try:
        response = structured_llm.invoke(messages)
        # Convert the structured response back to YAML string for the state
        plan_yaml = yaml.dump(response.model_dump(), sort_keys=False)
        logger.info("[bold green]Plan generated successfully.[/bold green]")
        return {"plan": plan_yaml}
    except Exception as e:
        logger.error(f"Failed to generate plan: {str(e)}")
        # Bare `raise` preserves the original traceback (no extra frame).
        raise

View File

@@ -0,0 +1,73 @@
from typing import List, Literal
from pydantic import BaseModel, Field
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
from ea_chatbot.graph.prompts.query_analyzer import QUERY_ANALYZER_PROMPT
class QueryAnalysis(BaseModel):
    """Structured analysis of the user's query.

    Produced by the query-analyzer LLM via structured output. The Field
    descriptions below are part of the prompt the LLM sees, so their exact
    wording steers the model's behavior — edit with care.
    """
    # Entities/data points the question mentions.
    data_required: List[str] = Field(description="List of data points or entities mentioned (e.g., ['2024 results', 'Florida']).")
    # What the user ultimately wants to learn.
    unknowns: List[str] = Field(description="List of target information the user wants to know or needed for final answer (e.g., 'who won', 'total votes').")
    # Only blocking gaps; non-critical gaps should use defaults instead.
    ambiguities: List[str] = Field(description="List of CRITICAL missing details that prevent ANY analysis. Do NOT include database names or plot types if defaults can be used.")
    # Filters/constraints, including those resolved from conversation history.
    conditions: List[str] = Field(description="List of any filters or constraints (e.g., ['year=2024', 'state=Florida']). Include context resolved from history.")
    # Routing decision consumed by the graph (popped out of the analysis dict).
    next_action: Literal["plan", "clarify", "research"] = Field(description="The next action to take. 'plan' for data analysis (even with defaults), 'research' for general knowledge, or 'clarify' ONLY for critical ambiguities.")
def query_analyzer_node(state: AgentState) -> dict:
    """Analyze the user's question and determine the next course of action."""
    question = state["question"]
    # Keep last 3 turns (6 messages) of history for context.
    recent_history = state.get("messages", [])[-6:]
    conversation_summary = state.get("summary", "")

    settings = Settings()
    logger = get_logger("query_analyzer")
    logger.info(f"Analyzing question: [italic]\"{question}\"[/italic]")

    # Build the LLM via the factory (with a logging callback to track usage)
    # and constrain it to emit a QueryAnalysis object.
    analyzer = get_llm_model(
        settings.query_analyzer_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    ).with_structured_output(QueryAnalysis)

    # Render the prompt template into a message list.
    prompt = QUERY_ANALYZER_PROMPT.format_messages(
        question=question,
        history=recent_history,
        summary=conversation_summary
    )

    try:
        # Invoke the structured LLM and defensively re-validate the result.
        raw_result = analyzer.invoke(prompt)
        validated = QueryAnalysis.model_validate(raw_result)
        next_action = validated.next_action
        # The routing decision lives in its own state key, not the analysis dict.
        analysis_dict = validated.model_dump()
        del analysis_dict["next_action"]
        logger.info(f"Analysis complete. Next action: [bold magenta]{next_action}[/bold magenta]")
    except Exception as e:
        logger.error(f"Error during query analysis: {str(e)}")
        # On failure, fall back to asking the user for clarification.
        analysis_dict = {
            "data_required": [],
            "unknowns": [],
            "ambiguities": [f"Error during analysis: {str(e)}"],
            "conditions": []
        }
        next_action = "clarify"

    return {
        "analysis": analysis_dict,
        "next_action": next_action,
        "iterations": 0
    }

View File

@@ -0,0 +1,60 @@
from langchain_core.messages import AIMessage
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils import helpers
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
from ea_chatbot.graph.prompts.researcher import RESEARCHER_PROMPT
def researcher_node(state: AgentState) -> dict:
    """Handle general research queries or web searches.

    Binds a provider-native search tool when the LLM is a Gemini or OpenAI
    chat model; otherwise falls back to the bare model.

    Args:
        state: Graph state; reads "question", the last 6 "messages", and
            "summary" (conversation summary).

    Returns:
        dict with the research response appended to "messages".

    Raises:
        Exception: re-raises any LLM invocation failure after logging it.
    """
    question = state["question"]
    history = state.get("messages", [])[-6:]
    summary = state.get("summary", "")

    settings = Settings()
    logger = get_logger("researcher")
    logger.info(f"Researching question: [italic]\"{question}\"[/italic]")

    # Use researcher_llm from settings
    llm = get_llm_model(
        settings.researcher_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    )

    date_str = helpers.get_readable_date()
    messages = RESEARCHER_PROMPT.format_messages(
        date=date_str,
        question=question,
        history=history,
        summary=summary
    )

    # Provider-aware tool binding
    try:
        if isinstance(llm, ChatGoogleGenerativeAI):
            # Native Google Search for Gemini
            llm_with_tools = llm.bind_tools([{"google_search": {}}])
        elif isinstance(llm, ChatOpenAI):
            # Native Web Search for OpenAI (built-in tool)
            llm_with_tools = llm.bind_tools([{"type": "web_search"}])
        else:
            # Fallback for other providers that might not support these specific search tools
            llm_with_tools = llm
    except Exception as e:
        logger.warning(f"Failed to bind search tools: {str(e)}. Falling back to base LLM.")
        llm_with_tools = llm

    try:
        response = llm_with_tools.invoke(messages)
        logger.info("[bold green]Research complete.[/bold green]")
        return {
            "messages": [response]
        }
    except Exception as e:
        logger.error(f"Research failed: {str(e)}")
        # Bare `raise` preserves the original traceback (no extra frame).
        raise

View File

@@ -0,0 +1,52 @@
from langchain_core.messages import SystemMessage, HumanMessage
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
def summarize_conversation_node(state: AgentState) -> dict:
    """Update the conversation summary based on the latest interaction."""
    current_summary = state.get("summary", "")
    all_messages = state.get("messages", [])

    # Nothing to summarize without messages.
    if not all_messages:
        return {}

    # Grab the last turn (User + Assistant pair).
    recent_turn = all_messages[-2:]

    settings = Settings()
    logger = get_logger("summarize_conversation")
    logger.info("Updating conversation summary...")

    # Use summarizer_llm for this task as well
    llm = get_llm_model(
        settings.summarizer_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    )

    # Choose extend-vs-create instructions based on whether a summary exists.
    if current_summary:
        instruction = (
            f"This is a summary of the conversation so far: {current_summary}\n\n"
            "Extend the summary by taking into account the new messages above."
        )
        system_text = f"Current summary: {current_summary}"
    else:
        instruction = "Create a summary of the conversation above."
        system_text = "You are a helpful assistant that summarizes conversations."

    summarization_messages = [
        SystemMessage(content=system_text),
        HumanMessage(content=f"Recent messages:\n{recent_turn}\n\n{instruction}\n\nKeep the summary concise and focused on the key topics and data points discussed.")
    ]

    try:
        reply = llm.invoke(summarization_messages)
        logger.info("[bold green]Conversation summary updated.[/bold green]")
        return {"summary": reply.content}
    except Exception as e:
        logger.error(f"Failed to update summary: {str(e)}")
        # If summarization fails, keep the old summary.
        return {"summary": current_summary}

View File

@@ -0,0 +1,44 @@
from langchain_core.messages import AIMessage
from ea_chatbot.graph.state import AgentState
from ea_chatbot.config import Settings
from ea_chatbot.utils.llm_factory import get_llm_model
from ea_chatbot.utils.logging import get_logger, LangChainLoggingHandler
from ea_chatbot.graph.prompts.summarizer import SUMMARIZER_PROMPT
def summarizer_node(state: AgentState) -> dict:
    """Summarize the code execution results into a final answer.

    Args:
        state: Graph state; reads "question", "plan", "code_output",
            the last 6 "messages", and "summary".

    Returns:
        dict with the final answer appended to "messages".

    Raises:
        Exception: re-raises any LLM invocation failure after logging it.
    """
    question = state["question"]
    plan = state.get("plan", "")
    code_output = state.get("code_output", "")
    history = state.get("messages", [])[-6:]
    summary = state.get("summary", "")

    settings = Settings()
    logger = get_logger("summarizer")
    logger.info("Generating final summary...")

    llm = get_llm_model(
        settings.summarizer_llm,
        callbacks=[LangChainLoggingHandler(logger=logger)]
    )

    messages = SUMMARIZER_PROMPT.format_messages(
        question=question,
        plan=plan,
        code_output=code_output,
        history=history,
        summary=summary
    )

    try:
        response = llm.invoke(messages)
        logger.info("[bold green]Summary generated.[/bold green]")
        # Return the final message to be added to the state
        return {
            "messages": [response]
        }
    except Exception as e:
        logger.error(f"Failed to generate summary: {str(e)}")
        # Bare `raise` preserves the original traceback (no extra frame).
        raise