import pytest
import pandas as pd
import os
from unittest.mock import MagicMock
from ea_chatbot.utils.database_inspection import get_primary_key, inspect_db_table, get_data_summary


@pytest.fixture
def mock_db_client():
    """A MagicMock database client configured for the default test table."""
    client = MagicMock()
    client.settings = {"table": "test_table"}
    return client


def test_get_primary_key(mock_db_client):
    """Test dynamic primary key discovery."""
    # The PK lookup query should yield a single-row frame with the key name.
    mock_db_client.query_df.return_value = pd.DataFrame({"column_name": ["my_pk"]})

    discovered = get_primary_key(mock_db_client, "test_table")

    assert discovered == "my_pk"
    # Verify the query was issued at least once.
    assert mock_db_client.query_df.called


def test_inspect_db_table_improved(mock_db_client, tmp_path):
    """Test improved inspect_db_table with cardinality and sampling."""
    data_dir = str(tmp_path)

    # Schema: three columns of mixed types.
    columns_df = pd.DataFrame({
        "column_name": ["id", "category", "count"],
        "data_type": ["integer", "text", "integer"],
    })
    # Table-level row count.
    total_rows_df = pd.DataFrame([{"count": 100}])
    # Primary-key discovery result.
    pk_df = pd.DataFrame({"column_name": ["id"]})

    def fake_query(query):
        # Ordered (substring, result) routing; the first match wins, mirroring
        # the priority of an if/elif chain. Frames are built per call so each
        # invocation hands back a fresh object.
        routes = [
            ("information_schema.columns", columns_df),
            ("COUNT(*)", total_rows_df),
            ("information_schema.key_column_usage", pk_df),
            # "category" stats: low cardinality, so distinct values are sampled.
            ('COUNT("category")', pd.DataFrame([{"count": 100}])),
            ('COUNT(DISTINCT "category")', pd.DataFrame([{"count": 5}])),
            ('SELECT DISTINCT "category"',
             pd.DataFrame({"category": ["A", "B", "C", "D", "E"]})),
            # "count" stats.
            ('COUNT("count")', pd.DataFrame([{"count": 100}])),
            ('COUNT(DISTINCT "count")', pd.DataFrame([{"count": 100}])),
            ('AVG("count")', pd.DataFrame([{"avg": 10.0, "min": 1, "max": 20}])),
            # "id" stats (guards against the earlier IndexError regression).
            ('COUNT("id")', pd.DataFrame([{"count": 100}])),
            ('COUNT(DISTINCT "id")', pd.DataFrame([{"count": 100}])),
            ('AVG("id")', pd.DataFrame([{"avg": 50.0, "min": 1, "max": 100}])),
        ]
        for needle, frame in routes:
            if needle in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = fake_query

    # Run inspection, then read back the persisted summary.
    inspect_db_table(mock_db_client, data_dir=data_dir)
    summary = get_data_summary(data_dir)

    assert summary is not None
    assert "test_table" in summary
    assert "category" in summary
    assert "distinct_values" in summary
    assert "unique_count: 5" in summary
    assert "- A" in summary
    assert "- E" in summary
    assert "primary_key: id" in summary


def test_get_data_summary_none(tmp_path):
    """Test get_data_summary when file doesn't exist."""
    assert get_data_summary(str(tmp_path)) is None


def test_inspect_db_table_temporal(mock_db_client, tmp_path):
    """Test inspect_db_table with temporal columns."""
    data_dir = str(tmp_path)

    columns_df = pd.DataFrame({
        "column_name": ["created_at"],
        "data_type": ["timestamp without time zone"],
    })
    total_rows_df = pd.DataFrame([{"count": 50}])
    pk_df = pd.DataFrame()  # No primary key for this table.

    def fake_query(query):
        # First-match-wins routing, same priority as the checks it replaces.
        routes = [
            ("information_schema.columns", columns_df),
            ("COUNT(*)", total_rows_df),
            ("information_schema.key_column_usage", pk_df),
            ('COUNT("created_at")', pd.DataFrame([{"count": 50}])),
            ('COUNT(DISTINCT "created_at")', pd.DataFrame([{"count": 50}])),
            ('MIN("created_at")',
             pd.DataFrame([{"min": "2023-01-01", "max": "2023-12-31"}])),
        ]
        for needle, frame in routes:
            if needle in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = fake_query

    inspect_db_table(mock_db_client, data_dir=data_dir)
    summary = get_data_summary(data_dir)

    assert "created_at" in summary
    assert "min: '2023-01-01'" in summary
    assert "max: '2023-12-31'" in summary


def test_inspect_db_table_high_cardinality(mock_db_client, tmp_path):
    """Test inspect_db_table with high cardinality categorical column (no sample values)."""
    data_dir = str(tmp_path)

    columns_df = pd.DataFrame({
        "column_name": ["user_id"],
        "data_type": ["text"],
    })
    total_rows_df = pd.DataFrame([{"count": 100}])
    pk_df = pd.DataFrame()

    def fake_query(query):
        routes = [
            ("information_schema.columns", columns_df),
            ("COUNT(*)", total_rows_df),
            ("information_schema.key_column_usage", pk_df),
            ('COUNT("user_id")', pd.DataFrame([{"count": 100}])),
            # Cardinality above the sampling threshold (> 20).
            ('COUNT(DISTINCT "user_id")', pd.DataFrame([{"count": 50}])),
        ]
        for needle, frame in routes:
            if needle in query:
                return frame
        return pd.DataFrame()

    mock_db_client.query_df.side_effect = fake_query

    inspect_db_table(mock_db_client, data_dir=data_dir)
    summary = get_data_summary(data_dir)

    assert "user_id" in summary
    assert "unique_count: 50" in summary
    # High-cardinality columns must not list sampled distinct values.
    assert "distinct_values" not in summary


def test_inspect_db_table_checksum_skip(mock_db_client, tmp_path):
    """Test that inspection is skipped if checksum matches."""
    data_dir = str(tmp_path)
    table = "test_table"
    os.makedirs(data_dir, exist_ok=True)

    # The checksum query is mocked to return a hash matching the stored one,
    # so the inspection step should short-circuit.
    mock_db_client.query_df.return_value = pd.DataFrame([{"dml_hash": "my_hash"}])

    # Pre-seed a matching checksum file and an existing inspection artifact.
    with open(os.path.join(data_dir, "checksum"), "w") as f:
        f.write(f"{table}:my_hash\n")
    with open(os.path.join(data_dir, "inspection.yaml"), "w") as f:
        f.write(f"{table}: {{ existing: true }}")

    result = inspect_db_table(mock_db_client, data_dir=data_dir)

    # The previously written inspection content is returned untouched.
    assert "existing: true" in result
    # Exactly one query (the checksum lookup) — nothing else was re-inspected.
    assert mock_db_client.query_df.call_count == 1