Refactor: Move backend files to backend/ directory and split .gitignore
This commit is contained in:
195
backend/tests/test_database_inspection.py
Normal file
195
backend/tests/test_database_inspection.py
Normal file
@@ -0,0 +1,195 @@
|
||||
import pytest
|
||||
import pandas as pd
|
||||
import os
|
||||
from unittest.mock import MagicMock, patch
|
||||
from ea_chatbot.utils.database_inspection import get_primary_key, inspect_db_table, get_data_summary
|
||||
|
||||
@pytest.fixture
def mock_db_client():
    """Provide a MagicMock database client configured for ``test_table``."""
    client = MagicMock()
    client.settings = {"table": "test_table"}
    return client
|
||||
|
||||
def test_get_primary_key(mock_db_client):
    """get_primary_key should return the column reported by the PK query."""
    # Stub the primary-key lookup result.
    mock_db_client.query_df.return_value = pd.DataFrame(
        {"column_name": ["my_pk"]}
    )

    result = get_primary_key(mock_db_client, "test_table")

    assert result == "my_pk"
    # The discovery must actually hit the database at least once.
    assert mock_db_client.query_df.called
|
||||
|
||||
def test_inspect_db_table_improved(mock_db_client, tmp_path):
    """Test improved inspect_db_table with cardinality and sampling."""
    data_dir = str(tmp_path)

    # Ordered (query-fragment, response) pairs; the first match wins, which
    # mirrors the original if-chain so routing behaves identically.
    responses = [
        # Column names and types
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["id", "category", "count"],
            "data_type": ["integer", "text", "integer"],
        })),
        # Total row count
        ("COUNT(*)", pd.DataFrame([{"count": 100}])),
        # Primary-key discovery
        ("information_schema.key_column_usage",
         pd.DataFrame({"column_name": ["id"]})),
        # Category stats
        ('COUNT("category")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "category")', pd.DataFrame([{"count": 5}])),
        ('SELECT DISTINCT "category"',
         pd.DataFrame({"category": ["A", "B", "C", "D", "E"]})),
        # Count stats
        ('COUNT("count")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "count")', pd.DataFrame([{"count": 100}])),
        ('AVG("count")', pd.DataFrame([{"avg": 10.0, "min": 1, "max": 20}])),
        # ID stats (fix for IndexError)
        ('COUNT("id")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "id")', pd.DataFrame([{"count": 100}])),
        ('AVG("id")', pd.DataFrame([{"avg": 50.0, "min": 1, "max": 100}])),
    ]

    def route_query(query):
        for fragment, frame in responses:
            if fragment in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = route_query

    # Run inspection
    inspect_db_table(mock_db_client, data_dir=data_dir)

    # Read the written summary back and verify its contents.
    summary = get_data_summary(data_dir)
    assert summary is not None
    assert "test_table" in summary
    assert "category" in summary
    assert "distinct_values" in summary
    assert "unique_count: 5" in summary
    assert "- A" in summary
    assert "- E" in summary
    assert "primary_key: id" in summary
|
||||
|
||||
def test_get_data_summary_none(tmp_path):
    """Test get_data_summary when file doesn't exist."""
    # An empty temp directory contains no summary file.
    result = get_data_summary(str(tmp_path))
    assert result is None
|
||||
|
||||
def test_inspect_db_table_temporal(mock_db_client, tmp_path):
    """Test inspect_db_table with temporal columns."""
    data_dir = str(tmp_path)

    # Ordered (query-fragment, response) pairs; first match wins.
    responses = [
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["created_at"],
            "data_type": ["timestamp without time zone"],
        })),
        ("COUNT(*)", pd.DataFrame([{"count": 50}])),
        # No primary key defined on this table.
        ("information_schema.key_column_usage", pd.DataFrame()),
        ('COUNT("created_at")', pd.DataFrame([{"count": 50}])),
        ('COUNT(DISTINCT "created_at")', pd.DataFrame([{"count": 50}])),
        ('MIN("created_at")',
         pd.DataFrame([{"min": "2023-01-01", "max": "2023-12-31"}])),
    ]

    def route_query(query):
        for fragment, frame in responses:
            if fragment in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = route_query

    inspect_db_table(mock_db_client, data_dir=data_dir)

    summary = get_data_summary(data_dir)
    assert "created_at" in summary
    assert "min: '2023-01-01'" in summary
    assert "max: '2023-12-31'" in summary
|
||||
|
||||
def test_inspect_db_table_high_cardinality(mock_db_client, tmp_path):
    """Test inspect_db_table with high cardinality categorical column (no sample values)."""
    data_dir = str(tmp_path)

    # Ordered (query-fragment, response) pairs; first match wins.
    responses = [
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["user_id"],
            "data_type": ["text"],
        })),
        ("COUNT(*)", pd.DataFrame([{"count": 100}])),
        ("information_schema.key_column_usage", pd.DataFrame()),
        ('COUNT("user_id")', pd.DataFrame([{"count": 100}])),
        # High cardinality > 20, so no distinct-value sampling expected.
        ('COUNT(DISTINCT "user_id")', pd.DataFrame([{"count": 50}])),
    ]

    def route_query(query):
        for fragment, frame in responses:
            if fragment in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = route_query

    inspect_db_table(mock_db_client, data_dir=data_dir)

    summary = get_data_summary(data_dir)
    assert "user_id" in summary
    assert "unique_count: 50" in summary
    # Should NOT have distinct_values
    assert "distinct_values" not in summary
|
||||
|
||||
def test_inspect_db_table_checksum_skip(mock_db_client, tmp_path):
    """Test that inspection is skipped if checksum matches."""
    data_dir = str(tmp_path)
    table = "test_table"

    os.makedirs(data_dir, exist_ok=True)

    # The database reports a DML hash identical to the one already on disk.
    mock_db_client.query_df.return_value = pd.DataFrame(
        [{"dml_hash": "my_hash"}]
    )

    # Pre-seed the stored checksum and a previously generated inspection.
    with open(os.path.join(data_dir, "checksum"), "w") as checksum_file:
        checksum_file.write(f"{table}:my_hash\n")
    with open(os.path.join(data_dir, "inspection.yaml"), "w") as yaml_file:
        yaml_file.write(f"{table}: {{ existing: true }}")

    result = inspect_db_table(mock_db_client, data_dir=data_dir)

    # The existing content must be returned untouched ...
    assert "existing: true" in result
    # ... and only the single checksum query should hit the database:
    # one call for the checksum, then an early return on match.
    assert mock_db_client.query_df.call_count == 1
|
||||
|
||||
Reference in New Issue
Block a user