run-20260617-194747Z / dd-l2-bangkok-multicrs-audit

0:00

A contractor just shipped `bangkok_contractor_delivery.gpkg` and I need a defect cite-sheet before I sign off. Please walk every layer in the file and write one row per layer to `crs_audit.csv` with these columns: layer_name, declared_crs (formatted as `EPSG:NNNN`), geometry_type, feature_count, sample_x and sample_y for a sample coordinate taken from the layer as it sits on disk, and encoding_detected. For encoding_detected, look at the text attributes for anomalies and report either `latin1-mojibake` or `utf-8`.

0:00

System info

{
  "adapter": "openrouter",
  "model": "google/gemma-4-26b-a4b-it",
  "base_url": "https://openrouter.ai/api/v1"
}

0:01

Bash ls -l bangkok_contractor_delivery.gpkg

-rw-r--r-- 1 agent agent 2703360 Jun 17 20:25 bangkok_contractor_delivery.gpkg

0:29

Write /work/solve.py

content · 99 lines · py

import geopandas as gpd
import pandas as pd
import fiona
from fiona.crs import CRS

def _looks_like_mojibake(text):
    """Return True when *text* looks like UTF-8 bytes mis-decoded as Latin-1.

    The check reverses the suspected damage: the string is re-encoded with a
    single-byte codec and the resulting bytes are decoded as UTF-8. Genuine
    non-ASCII text fails one of the two steps (characters above U+00FF cannot
    be encoded; lone accented Latin-1 bytes are not valid UTF-8), whereas
    mojibake round-trips cleanly.
    """
    if text.isascii():
        # Pure ASCII is identical in every candidate encoding.
        return False
    # cp1252 is tried as well because many tools label it "latin1".
    for codec in ("latin-1", "cp1252"):
        try:
            text.encode(codec).decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            continue
        else:
            return True
    return False


def check_encoding(df):
    """Classify the text attributes of *df* as clean UTF-8 or Latin-1 mojibake.

    Every non-null string value in every object- or string-typed column is
    inspected (not just a leading sample), so a single damaged value anywhere
    in the table is enough to flag the layer. Non-string values found in
    object columns are ignored.

    Args:
        df: A pandas DataFrame (or GeoDataFrame) whose text columns are checked.

    Returns:
        "latin1-mojibake" if any string value looks like UTF-8 that was
        decoded with a single-byte codec, otherwise "utf-8" (also returned
        when there are no text columns at all).
    """
    for position, dtype in enumerate(df.dtypes):
        is_text_column = (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.StringDtype)
        )
        if not is_text_column:
            continue
        # Positional access keeps this working with duplicate column names.
        for value in df.iloc[:, position].dropna():
            if isinstance(value, str) and _looks_like_mojibake(value):
                return "latin1-mojibake"

    return "utf-8"

def _format_declared_crs(crs):
    """Format a layer CRS as ``EPSG:NNNN``, falling back to its string form.

    Returns "Unknown" when the layer has no CRS or the EPSG lookup fails.
    """
    if crs is None:
        return "Unknown"
    try:
        epsg_code = crs.to_epsg()
    except Exception as exc:
        # Best-effort: one unreadable CRS must not abort the whole audit,
        # but the failure is reported instead of being swallowed silently.
        print(f"Warning: could not resolve EPSG code for CRS: {exc}")
        return "Unknown"
    return f"EPSG:{epsg_code}" if epsg_code else str(crs)


def _first_vertex(geom):
    """Return the first stored (x, y) vertex of *geom*, or None if it has none."""
    if geom is None or geom.is_empty:
        return None
    if hasattr(geom, "geoms"):
        # Multi-part geometry or collection: descend into the parts in order.
        for part in geom.geoms:
            vertex = _first_vertex(part)
            if vertex is not None:
                return vertex
        return None
    # Polygons keep their vertices on the exterior ring rather than on .coords.
    coords = geom.exterior.coords if geom.geom_type == "Polygon" else geom.coords
    x, y = coords[0][:2]  # drop any Z value
    return x, y


def _sample_coordinate(geometries):
    """Return (x, y) from the first geometry with a vertex, else (None, None)."""
    for geom in geometries:
        vertex = _first_vertex(geom)
        if vertex is not None:
            return vertex
    return None, None


def audit_gpkg(file_path, output_csv):
    """Audit every layer of a GeoPackage and write one CSV row per layer.

    For each layer the row records: layer_name, declared_crs (``EPSG:NNNN``
    where resolvable), geometry_type, feature_count, sample_x / sample_y and
    encoding_detected (from ``check_encoding``).

    The sample coordinate is the first vertex of the first non-empty geometry,
    read without reprojection, so it is a coordinate the layer actually
    contains rather than a derived point such as a centroid. geometry_type
    lists the distinct geometry types present, sorted and joined with "/"
    when the layer is mixed; it is "None" for layers with no geometries.
    Attribute-only layers (no geometry column) are reported with
    geometry_type "None" and empty sample coordinates instead of raising.

    Args:
        file_path: Path to the GeoPackage to inspect.
        output_csv: Path of the CSV file to write.
    """
    audit_results = []

    for layer in fiona.listlayers(file_path):
        gdf = gpd.read_file(file_path, layer=layer)

        try:
            geometries = gdf.geometry
        except AttributeError:
            # Attribute-only table: there is no geometry column to inspect.
            geometries = None

        if geometries is None:
            geometry_type = "None"
            sample_x = sample_y = None
        else:
            present = geometries[geometries.notna()]
            type_names = sorted(set(present.geom_type.dropna()))
            geometry_type = "/".join(type_names) if type_names else "None"
            sample_x, sample_y = _sample_coordinate(present)

        audit_results.append({
            "layer_name": layer,
            "declared_crs": _format_declared_crs(getattr(gdf, "crs", None)),
            "geometry_type": geometry_type,
            "feature_count": len(gdf),
            "sample_x": sample_x,
            "sample_y": sample_y,
            "encoding_detected": check_encoding(gdf),
        })

    audit_df = pd.DataFrame(audit_results)
    audit_df.to_csv(output_csv, index=False)
    print(f"Audit complete. Results written to {output_csv}")

if __name__ == "__main__":
    # Script entry point: audit the delivered GeoPackage found in the working
    # directory and write the per-layer report next to it.
    audit_gpkg(
        file_path="bangkok_contractor_delivery.gpkg",
        output_csv="crs_audit.csv",
    )

Wrote /work/solve.py (3657 chars).