Agentic-Service-Data-Eyond-Catalog

Sleeping

App Files Files Community

Rifqi Hafizuddin committed 24 days ago

Commit

430c361

1 Parent(s): 49feaa9

[KM-564] Edit column description in catalog to reduce token & ingestion time

Browse files

Files changed (5) hide show

src/catalog/introspect/database.py +9 -3
src/catalog/introspect/tabular.py +8 -6
src/catalog/models.py +2 -3
src/catalog/render.py +18 -8
src/pipeline/db_pipeline/extractor.py +76 -35

src/catalog/introspect/database.py CHANGED Viewed

@@ -130,7 +130,6 @@ class DatabaseIntrospector(BaseIntrospector):
             source_id=client_id,
             source_type="schema",
             name=client.name,
-            description="",
             location_ref=location_ref,
             updated_at=datetime.now(UTC),
             tables=tables,
@@ -160,6 +159,7 @@ class DatabaseIntrospector(BaseIntrospector):
                             col["name"],
                             col.get("is_numeric", False),
                             row_count,
                         )
                     except Exception as e:
                         logger.error(
@@ -177,7 +177,6 @@ class DatabaseIntrospector(BaseIntrospector):
                     Table(
                         table_id=_stable_id("t_", table_name),
                         name=table_name,
-                        description="",
                         row_count=row_count,
                         columns=columns,
                         foreign_keys=foreign_keys,
@@ -218,18 +217,25 @@ class DatabaseIntrospector(BaseIntrospector):
             _normalize(v) for v in (profile.get("sample_values") or [])
         ] or None
         column = Column(
             column_id=_stable_id("c_", table_name, name),
             name=name,
             data_type=_map_sql_type(str(col["type"])),
-            description="",
             nullable=True,  # nullable not surfaced by extractor; default permissive
             pii_flag=False,
             sample_values=sample_values,
             stats=ColumnStats(
                 min=_normalize(profile.get("min")),
                 max=_normalize(profile.get("max")),
                 distinct_count=profile.get("distinct_count"),
             ),
         )
         if self._pii.detect(column):

             source_id=client_id,
             source_type="schema",
             name=client.name,
             location_ref=location_ref,
             updated_at=datetime.now(UTC),
             tables=tables,
                             col["name"],
                             col.get("is_numeric", False),
                             row_count,
+                            is_temporal=col.get("is_temporal", False),
                         )
                     except Exception as e:
                         logger.error(
                     Table(
                         table_id=_stable_id("t_", table_name),
                         name=table_name,
                         row_count=row_count,
                         columns=columns,
                         foreign_keys=foreign_keys,
             _normalize(v) for v in (profile.get("sample_values") or [])
         ] or None
+        top_raw = profile.get("top_values") or []
+        top_values: list[Any] | None = [
+            _normalize(v) for v, _cnt in top_raw
+        ] or None
         column = Column(
             column_id=_stable_id("c_", table_name, name),
             name=name,
             data_type=_map_sql_type(str(col["type"])),
             nullable=True,  # nullable not surfaced by extractor; default permissive
             pii_flag=False,
             sample_values=sample_values,
             stats=ColumnStats(
                 min=_normalize(profile.get("min")),
                 max=_normalize(profile.get("max")),
+                mean=_normalize(profile.get("mean")),
+                median=_normalize(profile.get("median")),
                 distinct_count=profile.get("distinct_count"),
+                top_values=top_values,
             ),
         )
         if self._pii.detect(column):

src/catalog/introspect/tabular.py CHANGED Viewed

@@ -141,7 +141,6 @@ class TabularIntrospector(BaseIntrospector):
             source_id=document_id,
             source_type="tabular",
             name=doc.filename,
-            description="",
             location_ref=location_ref,
             updated_at=datetime.now(UTC),
             tables=tables,
@@ -183,7 +182,6 @@ class TabularIntrospector(BaseIntrospector):
         return Table(
             table_id=_stable_id("t_", *id_parts),
             name=table_name,
-            description="",
             row_count=len(df),
             columns=columns,
             foreign_keys=[],
@@ -200,7 +198,7 @@ class TabularIntrospector(BaseIntrospector):
             (document_id, sheet_name, col_name) if sheet_name else (document_id, col_name)
         )
-        sample_raw = series.dropna().head(5).tolist()
         sample_values: list[Any] | None = [_normalize(v) for v in sample_raw] or None
         is_numeric = pd.api.types.is_numeric_dtype(series)
@@ -212,9 +210,14 @@ class TabularIntrospector(BaseIntrospector):
             if distinct_count <= 10
             else None
         )
         stats = ColumnStats(
-            min=_normalize(non_null.min()) if (is_numeric or is_dt) and len(non_null) > 0 else None,
-            max=_normalize(non_null.max()) if (is_numeric or is_dt) and len(non_null) > 0 else None,
             distinct_count=distinct_count,
             top_values=top_values,
         )
@@ -223,7 +226,6 @@ class TabularIntrospector(BaseIntrospector):
             column_id=_stable_id("c_", *id_parts),
             name=col_name,
             data_type=_map_pandas_type(series.dtype),
-            description="",
             nullable=bool(series.isnull().any()),
             pii_flag=False,
             sample_values=sample_values,

             source_id=document_id,
             source_type="tabular",
             name=doc.filename,
             location_ref=location_ref,
             updated_at=datetime.now(UTC),
             tables=tables,
         return Table(
             table_id=_stable_id("t_", *id_parts),
             name=table_name,
             row_count=len(df),
             columns=columns,
             foreign_keys=[],
             (document_id, sheet_name, col_name) if sheet_name else (document_id, col_name)
         )
+        sample_raw = series.dropna().head(3).tolist()
         sample_values: list[Any] | None = [_normalize(v) for v in sample_raw] or None
         is_numeric = pd.api.types.is_numeric_dtype(series)
             if distinct_count <= 10
             else None
         )
+        has_values = len(non_null) > 0
+        wants_range = (is_numeric or is_dt) and has_values
+        wants_mean = is_numeric and has_values
         stats = ColumnStats(
+            min=_normalize(non_null.min()) if wants_range else None,
+            max=_normalize(non_null.max()) if wants_range else None,
+            mean=float(non_null.mean()) if wants_mean else None,
+            median=float(non_null.median()) if wants_mean else None,
             distinct_count=distinct_count,
             top_values=top_values,
         )
             column_id=_stable_id("c_", *id_parts),
             name=col_name,
             data_type=_map_pandas_type(series.dtype),
             nullable=bool(series.isnull().any()),
             pii_flag=False,
             sample_values=sample_values,

src/catalog/models.py CHANGED Viewed

@@ -34,6 +34,8 @@ DataType = Literal["int", "decimal", "string", "datetime", "date", "bool", "json
 class ColumnStats(BaseModel):
     min: Any | None = None
     max: Any | None = None
     distinct_count: int | None = None
     top_values: list[Any] | None = None
@@ -42,7 +44,6 @@ class Column(BaseModel):
     column_id: str
     name: str
     data_type: DataType
-    description: str
     nullable: bool
     pii_flag: bool = False
     sample_values: list[Any] | None = None
@@ -64,7 +65,6 @@ class ForeignKey(BaseModel):
 class Table(BaseModel):
     table_id: str
     name: str
-    description: str
     row_count: int | None = None
     columns: list[Column]
     foreign_keys: list[ForeignKey] = Field(default_factory=list)
@@ -74,7 +74,6 @@ class Source(BaseModel):
     source_id: str
     source_type: SourceType
     name: str
-    description: str
     location_ref: str
     updated_at: datetime
     tables: list[Table] = Field(default_factory=list)

 class ColumnStats(BaseModel):
     min: Any | None = None
     max: Any | None = None
+    mean: float | None = None
+    median: float | None = None
     distinct_count: int | None = None
     top_values: list[Any] | None = None
     column_id: str
     name: str
     data_type: DataType
     nullable: bool
     pii_flag: bool = False
     sample_values: list[Any] | None = None
 class Table(BaseModel):
     table_id: str
     name: str
     row_count: int | None = None
     columns: list[Column]
     foreign_keys: list[ForeignKey] = Field(default_factory=list)
     source_id: str
     source_type: SourceType
     name: str
     location_ref: str
     updated_at: datetime
     tables: list[Table] = Field(default_factory=list)

src/catalog/render.py CHANGED Viewed

@@ -8,13 +8,18 @@ from .models import Source
 def render_source(source: Source) -> str:
     """Render a Source as the canonical text block consumed by the planner.
-    Includes stable IDs (so the LLM can echo them back), per-column data
-    type, sample values (or `PII (suppressed)` for flagged columns), basic
-    stats, and resolved-by-name foreign keys.
     """
     lines: list[str] = [
         f"Source: {source.name} ({source.source_type})",
-        f"Source ID: {source.source_id}",
         "",
         "Tables:",
     ]
@@ -26,9 +31,9 @@ def render_source(source: Source) -> str:
     for table in source.tables:
         rc = table.row_count
-        rc_str = f"({rc:,} rows) " if rc is not None else ""
         lines.append("")
-        lines.append(f"  Table: {table.name} {rc_str}— id={table.table_id}")
         lines.append("  Columns:")
         for col in table.columns:
             samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
@@ -38,12 +43,17 @@ def render_source(source: Source) -> str:
                     stats_parts.append(f"min={col.stats.min}")
                 if col.stats.max is not None:
                     stats_parts.append(f"max={col.stats.max}")
                 if col.stats.distinct_count is not None:
                     stats_parts.append(f"distinct={col.stats.distinct_count}")
             stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
             lines.append(
-                f"    - {col.name} [{col.data_type}]: samples={samples}{stats_str} "
-                f"— id={col.column_id}"
             )
         if table.foreign_keys:
             lines.append("  Foreign keys:")

 def render_source(source: Source) -> str:
     """Render a Source as the canonical text block consumed by the planner.
+    Identifiers (source_id / table_id / column_id) are intentionally NOT
+    rendered — the LLM references things by name, and the IR resolver maps
+    names back to stable IDs before validation. This saves ~10% input tokens
+    per planner call.
+    Columns show data type, sample values (or `PII (suppressed)`), and
+    populated stats only (min/max suppressed for string/bool, where they're
+    useless). Top values are listed when available for low-cardinality cols.
+    Foreign keys are resolved to names.
     """
     lines: list[str] = [
         f"Source: {source.name} ({source.source_type})",
         "",
         "Tables:",
     ]
     for table in source.tables:
         rc = table.row_count
+        rc_str = f" ({rc:,} rows)" if rc is not None else ""
         lines.append("")
+        lines.append(f"  Table: {table.name}{rc_str}")
         lines.append("  Columns:")
         for col in table.columns:
             samples = "PII (suppressed)" if col.pii_flag else (col.sample_values or [])
                     stats_parts.append(f"min={col.stats.min}")
                 if col.stats.max is not None:
                     stats_parts.append(f"max={col.stats.max}")
+                if col.stats.mean is not None:
+                    stats_parts.append(f"mean={col.stats.mean:.4g}")
+                if col.stats.median is not None:
+                    stats_parts.append(f"median={col.stats.median:.4g}")
                 if col.stats.distinct_count is not None:
                     stats_parts.append(f"distinct={col.stats.distinct_count}")
+                if col.stats.top_values:
+                    stats_parts.append(f"top={col.stats.top_values}")
             stats_str = (", " + ", ".join(stats_parts)) if stats_parts else ""
             lines.append(
+                f"    - {col.name} [{col.data_type}]: samples={samples}{stats_str}"
             )
         if table.foreign_keys:
             lines.append("  Foreign keys:")

src/pipeline/db_pipeline/extractor.py CHANGED Viewed

@@ -9,7 +9,7 @@ not user input.
 from typing import Optional
 import pandas as pd
-from sqlalchemy import Float, Integer, Numeric, inspect
 from sqlalchemy.engine import Engine
 from src.middlewares.logging import get_logger
@@ -17,10 +17,16 @@ from src.middlewares.logging import get_logger
 logger = get_logger("db_extractor")
 TOP_VALUES_THRESHOLD = 0.05  # show top values if distinct_ratio <= 5%
 # Dialects where PERCENTILE_CONT(...) WITHIN GROUP is supported as an aggregate.
 # MySQL has no percentile aggregate; BigQuery has PERCENTILE_CONT only as an
-# analytic (window) function — both drop median and keep min/max/mean.
 _MEDIAN_DIALECTS = frozenset({"postgresql", "mssql", "snowflake"})
@@ -53,7 +59,7 @@ def _qi(engine: Engine, name: str) -> str:
 def get_schema(
     engine: Engine, exclude_tables: Optional[frozenset[str]] = None
 ) -> dict[str, list[dict]]:
-    """Returns {table_name: [{name, type, is_numeric, is_primary_key, foreign_key}, ...]}."""
     exclude = exclude_tables or frozenset()
     inspector = inspect(engine)
     schema = {}
@@ -75,6 +81,7 @@ def get_schema(
                 "name": c["name"],
                 "type": str(c["type"]),
                 "is_numeric": isinstance(c["type"], (Integer, Numeric, Float)),
                 "is_primary_key": c["name"] in pk_cols,
                 "foreign_key": fk_map.get(c["name"]),
             }
@@ -96,8 +103,14 @@ def profile_column(
     col_name: str,
     is_numeric: bool,
     row_count: int,
 ) -> dict:
-    """Returns null_count, distinct_count, min/max, top values, and sample values."""
     if row_count == 0:
         return {
             "null_count": 0,
@@ -108,39 +121,69 @@ def profile_column(
     qt = _qi(engine, table_name)
     qc = _qi(engine, col_name)
-    # Combined stats query: null_count, distinct_count, and min/max (if numeric).
-    # One round-trip instead of two.
-    select_cols = [
         f"COUNT(*) - COUNT({qc}) AS nulls",
         f"COUNT(DISTINCT {qc}) AS distincts",
     ]
-    if is_numeric:
-        select_cols.append(f"MIN({qc}) AS min_val")
-        select_cols.append(f"MAX({qc}) AS max_val")
-        select_cols.append(f"AVG({qc}) AS mean_val")
-        if _supports_median(engine):
-            select_cols.append(
-                f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
-            )
-    stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
-    null_count = int(stats.iloc[0]["nulls"])
-    distinct_count = int(stats.iloc[0]["distincts"])
-    distinct_ratio = distinct_count / row_count if row_count > 0 else 0
-    profile = {
-        "null_count": null_count,
-        "distinct_count": distinct_count,
-        "distinct_ratio": round(distinct_ratio, 4),
-    }
-    if is_numeric:
-        profile["min"] = stats.iloc[0]["min_val"]
-        profile["max"] = stats.iloc[0]["max_val"]
-        profile["mean"] = stats.iloc[0]["mean_val"]
-        if _supports_median(engine):
             profile["median"] = stats.iloc[0]["median_val"]
     if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
         top_sql = _head_query(
@@ -153,9 +196,6 @@ def profile_column(
         top = pd.read_sql(top_sql, engine)
         profile["top_values"] = list(zip(top.iloc[:, 0].tolist(), top["cnt"].tolist()))
-    sample = pd.read_sql(_head_query(engine, qc, qt, 5), engine)
-    profile["sample_values"] = sample.iloc[:, 0].tolist()
     return profile
@@ -273,7 +313,8 @@ def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str
     text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
     if "min" in profile:
         text += f"Min: {profile['min']}, Max: {profile['max']}\n"
-        text += f"Mean: {profile['mean']}\n"
         if profile.get("median") is not None:
             text += f"Median: {profile['median']}\n"
     if "top_values" in profile:

 from typing import Optional
 import pandas as pd
+from sqlalchemy import Date, DateTime, Float, Integer, Numeric, inspect
 from sqlalchemy.engine import Engine
 from src.middlewares.logging import get_logger
 logger = get_logger("db_extractor")
 TOP_VALUES_THRESHOLD = 0.05  # show top values if distinct_ratio <= 5%
+SAMPLE_LIMIT = 3             # sample N rows per column (down from 5 — token cost)
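+# Dialects with a single-statement CTE that survives `pd.read_sql`. On these we
+# fold the stats and sample queries into one round-trip per column. MySQL and
+# SQLite are excluded entirely out of caution: the check is on the dialect name
+# only, which cannot tell MySQL <8 or old SQLite apart from newer versions, so
+# both always take the two-query fallback.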
+_CTE_DIALECTS = frozenset({"postgresql", "mssql", "snowflake", "bigquery"})
 # Dialects where PERCENTILE_CONT(...) WITHIN GROUP is supported as an aggregate.
 # MySQL has no percentile aggregate; BigQuery has PERCENTILE_CONT only as an
+# analytic (window) function — both drop median and keep mean.
 _MEDIAN_DIALECTS = frozenset({"postgresql", "mssql", "snowflake"})
 def get_schema(
     engine: Engine, exclude_tables: Optional[frozenset[str]] = None
 ) -> dict[str, list[dict]]:
+    """Returns {table_name: [{name, type, is_numeric, is_temporal, is_primary_key, foreign_key}, ...]}."""
     exclude = exclude_tables or frozenset()
     inspector = inspect(engine)
     schema = {}
                 "name": c["name"],
                 "type": str(c["type"]),
                 "is_numeric": isinstance(c["type"], (Integer, Numeric, Float)),
+                "is_temporal": isinstance(c["type"], (Date, DateTime)),
                 "is_primary_key": c["name"] in pk_cols,
                 "foreign_key": fk_map.get(c["name"]),
             }
     col_name: str,
     is_numeric: bool,
     row_count: int,
+    is_temporal: bool = False,
 ) -> dict:
+    """Returns null_count, distinct_count, distinct_ratio, min/max (numeric+temporal), mean/median (numeric), top values (only when distinct_ratio <= TOP_VALUES_THRESHOLD), and sample values.
+    Numeric columns compute mean and (where the dialect supports it) median.
+    Datetime/date get min/max only (no useful mean/median over timestamps).
+    Strings/bools skip range stats entirely.
+    """
     if row_count == 0:
         return {
             "null_count": 0,
     qt = _qi(engine, table_name)
     qc = _qi(engine, col_name)
+    wants_range = is_numeric or is_temporal
+    wants_mean = is_numeric
+    wants_median = is_numeric and _supports_median(engine)
+    profile: dict = {}
+    # Build the stats SELECT list incrementally — same column set used in both
+    # the CTE and fallback branches.
+    stat_cols = [
         f"COUNT(*) - COUNT({qc}) AS nulls",
         f"COUNT(DISTINCT {qc}) AS distincts",
     ]
+    if wants_range:
+        stat_cols += [f"MIN({qc}) AS min_val", f"MAX({qc}) AS max_val"]
+    if wants_mean:
+        stat_cols.append(f"AVG({qc}) AS mean_val")
+    if wants_median:
+        stat_cols.append(
+            f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
+        )
+    if engine.dialect.name in _CTE_DIALECTS:
+        # Single round-trip: stats + sample together via CTE.
+        stats_select = ", ".join(stat_cols)
+        passthrough = ", ".join(
+            f"s.{c.split(' AS ')[-1]}" for c in stat_cols
+        )
+        sql = (
+            f"WITH stats AS (SELECT {stats_select} FROM {qt}), "
+            f"sample AS ({_head_query(engine, qc + ' AS sample_val', qt, SAMPLE_LIMIT)}) "
+            f"SELECT {passthrough}, sample.sample_val FROM stats s CROSS JOIN sample"
+        )
+        rows = pd.read_sql(sql, engine)
+        null_count = int(rows.iloc[0]["nulls"])
+        distinct_count = int(rows.iloc[0]["distincts"])
+        sample_values = rows["sample_val"].tolist()
+        if wants_range:
+            profile["min"] = rows.iloc[0]["min_val"]
+            profile["max"] = rows.iloc[0]["max_val"]
+        if wants_mean:
+            profile["mean"] = rows.iloc[0]["mean_val"]
+        if wants_median:
+            profile["median"] = rows.iloc[0]["median_val"]
+    else:
+        # Two-query fallback (MySQL/SQLite).
+        stats = pd.read_sql(f"SELECT {', '.join(stat_cols)} FROM {qt}", engine)
+        null_count = int(stats.iloc[0]["nulls"])
+        distinct_count = int(stats.iloc[0]["distincts"])
+        if wants_range:
+            profile["min"] = stats.iloc[0]["min_val"]
+            profile["max"] = stats.iloc[0]["max_val"]
+        if wants_mean:
+            profile["mean"] = stats.iloc[0]["mean_val"]
+        if wants_median:
             profile["median"] = stats.iloc[0]["median_val"]
+        sample = pd.read_sql(_head_query(engine, qc, qt, SAMPLE_LIMIT), engine)
+        sample_values = sample.iloc[:, 0].tolist()
+    distinct_ratio = distinct_count / row_count if row_count > 0 else 0
+    profile["null_count"] = null_count
+    profile["distinct_count"] = distinct_count
+    profile["distinct_ratio"] = round(distinct_ratio, 4)
+    profile["sample_values"] = sample_values
     if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
         top_sql = _head_query(
         top = pd.read_sql(top_sql, engine)
         profile["top_values"] = list(zip(top.iloc[:, 0].tolist(), top["cnt"].tolist()))
     return profile
     text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
     if "min" in profile:
         text += f"Min: {profile['min']}, Max: {profile['max']}\n"
+        if profile.get("mean") is not None:
+            text += f"Mean: {profile['mean']}\n"
         if profile.get("median") is not None:
             text += f"Median: {profile['median']}\n"
     if "top_values" in profile: