"""Tests for ea_chatbot.utils.database_inspection.

Covers primary-key discovery, table inspection (cardinality stats, value
sampling, temporal ranges), summary retrieval, and checksum-based skipping.
"""
import os
from unittest.mock import MagicMock

import pandas as pd
import pytest

from ea_chatbot.utils.database_inspection import get_primary_key, inspect_db_table, get_data_summary
@pytest.fixture
def mock_db_client():
    """Fake database client whose settings point at the table under test."""
    client = MagicMock()
    client.settings = {"table": "test_table"}
    return client
def test_get_primary_key(mock_db_client):
    """The primary key name is taken from the key-column query result."""
    # Whatever PK query runs, it yields a single column_name row.
    mock_db_client.query_df.return_value = pd.DataFrame({"column_name": ["my_pk"]})

    primary_key = get_primary_key(mock_db_client, "test_table")

    assert primary_key == "my_pk"
    # Discovery must actually have gone through the database client.
    assert mock_db_client.query_df.called
def test_inspect_db_table_improved(mock_db_client, tmp_path):
    """Cardinality stats, value sampling, and PK discovery land in the summary."""
    data_dir = str(tmp_path)

    # Ordered (substring, response) routing table for every query the
    # inspector is expected to issue.  Earlier entries win, mirroring the
    # first-match semantics of a plain if-chain — order matters where
    # patterns could overlap.
    responses = [
        # Schema: column names and types.
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["id", "category", "count"],
            "data_type": ["integer", "text", "integer"],
        })),
        # Total row count.
        ("COUNT(*)", pd.DataFrame([{"count": 100}])),
        # Primary-key discovery.
        ("information_schema.key_column_usage", pd.DataFrame({"column_name": ["id"]})),
        # "category": low-cardinality text column, so distinct values get sampled.
        ('COUNT("category")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "category")', pd.DataFrame([{"count": 5}])),
        ('SELECT DISTINCT "category"', pd.DataFrame({"category": ["A", "B", "C", "D", "E"]})),
        # "count": numeric column with aggregate stats.
        ('COUNT("count")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "count")', pd.DataFrame([{"count": 100}])),
        ('AVG("count")', pd.DataFrame([{"avg": 10.0, "min": 1, "max": 20}])),
        # "id": numeric PK column (non-empty frames guard against IndexError).
        ('COUNT("id")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "id")', pd.DataFrame([{"count": 100}])),
        ('AVG("id")', pd.DataFrame([{"avg": 50.0, "min": 1, "max": 100}])),
    ]

    def route_query(query):
        for needle, frame in responses:
            if needle in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = route_query

    # Run inspection, then read the written summary back to verify it.
    inspect_db_table(mock_db_client, data_dir=data_dir)
    summary = get_data_summary(data_dir)

    assert summary is not None
    assert "test_table" in summary
    assert "category" in summary
    assert "distinct_values" in summary
    assert "unique_count: 5" in summary
    assert "- A" in summary
    assert "- E" in summary
    assert "primary_key: id" in summary
def test_get_data_summary_none(tmp_path):
    """A directory with no inspection file yields None rather than an error."""
    assert get_data_summary(str(tmp_path)) is None
def test_inspect_db_table_temporal(mock_db_client, tmp_path):
    """Timestamp columns are summarized with their min/max date range."""
    data_dir = str(tmp_path)

    # Ordered (substring, response) table — first match wins, preserving
    # the routing order of a plain if-chain.
    responses = [
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["created_at"],
            "data_type": ["timestamp without time zone"],
        })),
        ("COUNT(*)", pd.DataFrame([{"count": 50}])),
        # Empty frame: the table declares no primary key.
        ("information_schema.key_column_usage", pd.DataFrame()),
        ('COUNT("created_at")', pd.DataFrame([{"count": 50}])),
        ('COUNT(DISTINCT "created_at")', pd.DataFrame([{"count": 50}])),
        ('MIN("created_at")', pd.DataFrame([{"min": "2023-01-01", "max": "2023-12-31"}])),
    ]

    def route_query(query):
        for needle, frame in responses:
            if needle in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = route_query

    inspect_db_table(mock_db_client, data_dir=data_dir)
    summary = get_data_summary(data_dir)

    assert "created_at" in summary
    assert "min: '2023-01-01'" in summary
    assert "max: '2023-12-31'" in summary
def test_inspect_db_table_high_cardinality(mock_db_client, tmp_path):
    """High-cardinality text columns report a count but skip value sampling."""
    data_dir = str(tmp_path)

    # Ordered (substring, response) table — first match wins.
    responses = [
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["user_id"],
            "data_type": ["text"],
        })),
        ("COUNT(*)", pd.DataFrame([{"count": 100}])),
        # No primary key declared.
        ("information_schema.key_column_usage", pd.DataFrame()),
        ('COUNT("user_id")', pd.DataFrame([{"count": 100}])),
        # 50 distinct values: above the cardinality cutoff (> 20), so no
        # distinct-value sample should be emitted.
        ('COUNT(DISTINCT "user_id")', pd.DataFrame([{"count": 50}])),
    ]

    def route_query(query):
        for needle, frame in responses:
            if needle in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = route_query

    inspect_db_table(mock_db_client, data_dir=data_dir)
    summary = get_data_summary(data_dir)

    assert "user_id" in summary
    assert "unique_count: 50" in summary
    # Sampling must be suppressed for high-cardinality columns.
    assert "distinct_values" not in summary
def test_inspect_db_table_checksum_skip(mock_db_client, tmp_path):
    """A matching on-disk checksum short-circuits re-inspection entirely."""
    data_dir = str(tmp_path)
    table = "test_table"
    os.makedirs(data_dir, exist_ok=True)

    # The database reports the same DML hash that is already stored on disk.
    mock_db_client.query_df.return_value = pd.DataFrame([{"dml_hash": "my_hash"}])

    # Pre-seed the stored checksum and a previously written inspection file.
    with open(os.path.join(data_dir, "checksum"), "w") as f:
        f.write(f"{table}:my_hash\n")
    with open(os.path.join(data_dir, "inspection.yaml"), "w") as f:
        f.write(f"{table}: {{ existing: true }}")

    result = inspect_db_table(mock_db_client, data_dir=data_dir)

    # The existing summary is returned untouched...
    assert "existing: true" in result
    # ...and only the checksum query ran — no per-column inspection queries.
    assert mock_db_client.query_df.call_count == 1