run-20260528-1624Z / dd-l2-bangkok-multicrs-audit

0:00

A contractor just shipped `bangkok_contractor_delivery.gpkg` and I need a defect cite-sheet before I sign off. Please walk every layer in the file and write one row per layer to `crs_audit.csv` with these columns: layer_name, declared_crs (formatted as `EPSG:NNNN`), geometry_type, feature_count, sample_x and sample_y for a sample coordinate taken from the layer as it sits on disk, and encoding_detected. For encoding_detected, look at the text attributes for anomalies and report either `latin1-mojibake` or `utf-8`.

0:00

System info

{
  "adapter": "openrouter",
  "model": "google/gemma-4-26b-a4b-it",
  "base_url": "https://openrouter.ai/api/v1"
}

0:01

Bash ls -F

bangkok_contractor_delivery.gpkg

0:21

Write /work/solve.py

content · 95 lines · py

import geopandas as gpd
import fiona
import pandas as pd
from shapely.geometry import Point

def _is_latin1_mojibake(text):
    """Return True if *text* looks like UTF-8 that was decoded with a single-byte codec.

    The check reverses the suspected mistake: the string is re-encoded with
    Latin-1 (then Windows-1252) and the resulting bytes are decoded as UTF-8.
    Correctly decoded non-ASCII text fails one of those two steps -- Thai
    characters cannot be encoded as Latin-1, and a lone accented letter such
    as the 'Ã' in 'SÃO' is not a valid UTF-8 byte sequence -- whereas mojibake
    round-trips cleanly.
    """
    # Pure ASCII is identical in every codec involved, so it carries no signal.
    if text.isascii():
        return False
    for codec in ('latin-1', 'cp1252'):
        try:
            text.encode(codec).decode('utf-8')
        except UnicodeError:
            # Either not representable in this codec or not valid UTF-8 bytes.
            continue
        return True
    return False


def detect_encoding(gdf):
    """Classify the text attributes of a layer as clean UTF-8 or Latin-1 mojibake.

    Args:
        gdf: A DataFrame or GeoDataFrame holding the layer's attributes.

    Returns:
        'latin1-mojibake' if any string value in any column round-trips through
        a single-byte codec into valid UTF-8 (see ``_is_latin1_mojibake``),
        otherwise 'utf-8'.
    """
    for _, series in gdf.items():
        dtype = series.dtype
        # Numeric, boolean and datetime columns cannot contain text; the
        # geometry column holds shapes rather than strings.
        if getattr(dtype, 'kind', 'O') in 'biufcmM':
            continue
        if getattr(dtype, 'name', None) == 'geometry':
            continue
        for value in series.dropna():
            if isinstance(value, str) and _is_latin1_mojibake(value):
                return 'latin1-mojibake'
    return 'utf-8'

def _format_crs(crs):
    """Render a layer CRS as ``EPSG:NNNN``, falling back to its string form."""
    if crs is None:
        return "None"
    # to_epsg() returns None when no EPSG code can be matched.
    epsg = crs.to_epsg()
    if epsg is not None:
        return f"EPSG:{epsg}"
    return str(crs)


def _first_vertex(geom):
    """Return the (x, y) of the first stored vertex of *geom*, or None.

    Works for every geometry type: multi-part geometries and collections are
    searched part by part, polygons use their exterior ring, and null or
    empty geometries yield None.
    """
    if geom is None or geom.is_empty:
        return None
    if hasattr(geom, 'geoms'):
        for part in geom.geoms:
            vertex = _first_vertex(part)
            if vertex is not None:
                return vertex
        return None
    # Polygons expose coordinates through their rings, not directly.
    source = geom.exterior if geom.geom_type == 'Polygon' else geom
    first = source.coords[0]
    # Drop any Z value; the audit only reports planar coordinates.
    return first[0], first[1]


def audit_gpkg(file_path):
    """Audit every layer of a GeoPackage and write one row per layer to CSV.

    For each layer the row records the layer name, the declared CRS, the
    geometry type(s) present, the feature count, a sample coordinate and the
    result of ``detect_encoding`` on the attributes. Layers are read without
    reprojection, so the sample coordinate is in the layer's own CRS.

    Args:
        file_path: Path to the GeoPackage to audit.

    Side effects:
        Writes ``crs_audit.csv`` to the current working directory and prints
        a completion message.
    """
    audit_data = []

    for layer_name in fiona.listlayers(file_path):
        gdf = gpd.read_file(file_path, layer=layer_name)

        # Attribute-only tables may come back as a plain DataFrame, which has
        # neither a CRS nor a geometry column.
        geometry = gdf.geometry if isinstance(gdf, gpd.GeoDataFrame) else None

        declared_crs = _format_crs(getattr(gdf, 'crs', None))

        geom_type = "None"
        sample_x = sample_y = None
        if geometry is not None:
            # Report every distinct type so mixed layers are visible in the audit.
            kinds = sorted(geometry.geom_type.dropna().unique())
            if kinds:
                geom_type = "/".join(kinds)

            # Take the sample from the first feature that actually has
            # coordinates, skipping null and empty geometries.
            for geom in geometry:
                vertex = _first_vertex(geom)
                if vertex is not None:
                    sample_x, sample_y = vertex
                    break

        audit_data.append({
            'layer_name': layer_name,
            'declared_crs': declared_crs,
            'geometry_type': geom_type,
            'feature_count': len(gdf),
            'sample_x': sample_x,
            'sample_y': sample_y,
            'encoding_detected': detect_encoding(gdf),
        })

    df = pd.DataFrame(audit_data)
    df.to_csv('crs_audit.csv', index=False)
    print("Audit complete. Results written to crs_audit.csv")

# Script entry point: audit the contractor delivery found in the working directory.
if __name__ == "__main__":
    audit_gpkg(file_path='bangkok_contractor_delivery.gpkg')

Wrote /work/solve.py (3369 chars).