"""Tests for ``ea_chatbot.utils.database_inspection``.

File: ea-chatbot-lg/backend/tests/test_database_inspection.py
"""
import pytest
import pandas as pd
import os
from unittest.mock import MagicMock, patch
from ea_chatbot.utils.database_inspection import get_primary_key, inspect_db_table, get_data_summary
@pytest.fixture
def mock_db_client():
    """Provide a MagicMock DB client whose settings point at ``test_table``."""
    client = MagicMock()
    client.settings = {"table": "test_table"}
    return client
def test_get_primary_key(mock_db_client):
    """Test dynamic primary key discovery."""
    # Canned response for the primary-key lookup query.
    mock_db_client.query_df.return_value = pd.DataFrame({"column_name": ["my_pk"]})

    result = get_primary_key(mock_db_client, "test_table")

    assert result == "my_pk"
    # The helper must have gone through the client at least once.
    assert mock_db_client.query_df.called
def test_inspect_db_table_improved(mock_db_client, tmp_path):
    """Test improved inspect_db_table with cardinality and sampling."""
    data_dir = str(tmp_path)

    schema_df = pd.DataFrame({
        "column_name": ["id", "category", "count"],
        "data_type": ["integer", "text", "integer"],
    })
    row_count_df = pd.DataFrame([{"count": 100}])
    primary_key_df = pd.DataFrame({"column_name": ["id"]})

    # Ordered (substring, response) rules; the first matching rule wins,
    # which mirrors the if-chain order a hand-written side effect would use.
    rules = [
        ("information_schema.columns", schema_df),
        ("COUNT(*)", row_count_df),
        ("information_schema.key_column_usage", primary_key_df),
        # Category stats
        ('COUNT("category")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "category")', pd.DataFrame([{"count": 5}])),
        ('SELECT DISTINCT "category"',
         pd.DataFrame({"category": ["A", "B", "C", "D", "E"]})),
        # Count stats
        ('COUNT("count")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "count")', pd.DataFrame([{"count": 100}])),
        ('AVG("count")', pd.DataFrame([{"avg": 10.0, "min": 1, "max": 20}])),
        # ID stats (fix for IndexError)
        ('COUNT("id")', pd.DataFrame([{"count": 100}])),
        ('COUNT(DISTINCT "id")', pd.DataFrame([{"count": 100}])),
        ('AVG("id")', pd.DataFrame([{"avg": 50.0, "min": 1, "max": 100}])),
    ]

    def fake_query(query):
        for needle, response in rules:
            if needle in query:
                return response
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = fake_query

    # Run inspection, then read the generated summary back.
    inspect_db_table(mock_db_client, data_dir=data_dir)
    summary = get_data_summary(data_dir)

    assert summary is not None
    assert "test_table" in summary
    assert "category" in summary
    assert "distinct_values" in summary
    assert "unique_count: 5" in summary
    assert "- A" in summary
    assert "- E" in summary
    assert "primary_key: id" in summary
def test_get_data_summary_none(tmp_path):
    """Test get_data_summary when file doesn't exist."""
    summary = get_data_summary(str(tmp_path))
    assert summary is None
def test_inspect_db_table_temporal(mock_db_client, tmp_path):
    """Test inspect_db_table with temporal columns."""
    data_dir = str(tmp_path)

    # Ordered (substring, response) rules matched first-to-last.
    responses = [
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["created_at"],
            "data_type": ["timestamp without time zone"],
        })),
        ("COUNT(*)", pd.DataFrame([{"count": 50}])),
        # Empty frame => table has no primary key.
        ("information_schema.key_column_usage", pd.DataFrame()),
        ('COUNT("created_at")', pd.DataFrame([{"count": 50}])),
        ('COUNT(DISTINCT "created_at")', pd.DataFrame([{"count": 50}])),
        ('MIN("created_at")',
         pd.DataFrame([{"min": "2023-01-01", "max": "2023-12-31"}])),
    ]

    def fake_query(query):
        return next(
            (df for needle, df in responses if needle in query),
            pd.DataFrame(),
        )

    mock_db_client.query_df.side_effect = fake_query
    inspect_db_table(mock_db_client, data_dir=data_dir)

    summary = get_data_summary(data_dir)
    assert "created_at" in summary
    assert "min: '2023-01-01'" in summary
    assert "max: '2023-12-31'" in summary
def test_inspect_db_table_high_cardinality(mock_db_client, tmp_path):
    """Test inspect_db_table with high cardinality categorical column (no sample values)."""
    data_dir = str(tmp_path)

    responses = [
        ("information_schema.columns", pd.DataFrame({
            "column_name": ["user_id"],
            "data_type": ["text"],
        })),
        ("COUNT(*)", pd.DataFrame([{"count": 100}])),
        # Empty frame => no primary key.
        ("information_schema.key_column_usage", pd.DataFrame()),
        ('COUNT("user_id")', pd.DataFrame([{"count": 100}])),
        # High cardinality > 20, so no distinct-value sampling should happen.
        ('COUNT(DISTINCT "user_id")', pd.DataFrame([{"count": 50}])),
    ]

    def fake_query(query):
        return next(
            (df for needle, df in responses if needle in query),
            pd.DataFrame(),
        )

    mock_db_client.query_df.side_effect = fake_query
    inspect_db_table(mock_db_client, data_dir=data_dir)

    summary = get_data_summary(data_dir)
    assert "user_id" in summary
    assert "unique_count: 50" in summary
    # Should NOT have distinct_values
    assert "distinct_values" not in summary
def test_inspect_db_table_checksum_skip(mock_db_client, tmp_path):
    """Test that inspection is skipped if checksum matches."""
    data_dir = str(tmp_path)
    table = "test_table"
    os.makedirs(data_dir, exist_ok=True)

    # Checksum is md5 of "ins|upd|del". Let's say mock returns "my_hash"
    mock_db_client.query_df.return_value = pd.DataFrame([{"dml_hash": "my_hash"}])

    # Pre-seed a matching checksum and an existing inspection result on disk.
    with open(os.path.join(data_dir, "checksum"), "w") as f:
        f.write(f"{table}:my_hash\n")
    with open(os.path.join(data_dir, "inspection.yaml"), "w") as f:
        f.write(f"{table}: {{ existing: true }}")

    result = inspect_db_table(mock_db_client, data_dir=data_dir)

    # The cached inspection should be returned untouched.
    assert "existing: true" in result
    # Logic: 1 call for checksum. If match, return without re-inspecting.
    assert mock_db_client.query_df.call_count == 1