fio-l1-nyc-csvwkt-addresses
Map (reference ← swipe → agent)
0:00
The vendor SQL export `nyc_addresses` needs proper GeoParquet so `WHERE recorded_at > '2024-01-01'` and `SUM(unit_count)` work without CASTs. Convert it to `addresses.geoparquet`: Point geometry in EPSG:4326, `recorded_at` as `timestamp[us]`, `unit_count` as `int32`, all other columns as string.
0:00
System info
{
"adapter": "openrouter",
"model": "google/gemma-4-26b-a4b-it",
"base_url": "https://openrouter.ai/api/v1"
}
0:01
Bash
ls -F
nyc_addresses.csv
0:02
Read
nyc_addresses.csv
1 "id","country","postcode","street","number","unit","postal_city","recorded_at","unit_count","geometry_wkt"
2 "0005eda0-2540-4e36-bc81-dfb52e323d99","US","10002","ALLEN ST","37","1/2","","2024-01-01T00:00:00Z","0","POINT (-73.9922338 40.716163)"
3 "00587e13-6253-485d-9258-e47d43af7559","US","10038","MADISON ST","45","","","2024-01-12T07:13:17Z","0","POINT (-73.9983308 40.71204)"
4 "0070f126-d36a-43b3-b767-a8fbb7156602","US","10038","PECK SLIP","44","","","2024-01-23T14:26:34Z","0","POINT (-74.0015005 40.7073293)"
5 "00a8fc38-5d9a-42da-94a9-7d2f4101f82c","US","11201","FRONT ST","11","","","2024-02-03T21:39:51Z","0","POINT (-73.9928578 40.702681)"
6 "00f815af-51f6-44c2-b0d6-ee23092d14b2","US","10280","ALBANY ST","340","","","2024-02-14T04:52:08Z","1","POINT (-74.0175967 40.7102422)"
7 "014806e3-0deb-4be3-abd5-2be10dedf40c","US","10013","CHURCH ST","299","","","2024-02-25T11:05:25Z","0","POINT (-74.0043074 40.7193005)"
8 "018c3b29-04d1-4890-a12b-8098f6f0c7de","US","10013","WALKER ST","37","","","2024-03-07T18:18:42Z","0","POINT (-74.0040214 40.7192277)"
9 "01cbed19-2940-4b4a-9d90-6f0be7f1b604","US","10012","MOTT ST","184","","","2024-03-18T01:31:59Z","0","POINT (-73.9957225 40.7207067)"
10 "01fadb51-074c-4050-a670-4a615337d972","US","10002","E BROADWAY","53","","","2024-03-29T08:44:16Z","2","POINT (-73.9955533 40.7135006)"
0:11
Write
solve.py
content · 54 lines · py
import pandas as pd
import geopandas as gpd
from shapely import wkt
import pyarrow as pa
import pyarrow.parquet as pq
def convert_csv_to_geoparquet(input_csv, output_parquet):
print(f"Reading {input_csv}...")
# Load CSV
df = pd.read_csv(input_csv)
print("Converting geometry...")
# Convert WKT to shapely geometry
df['geometry'] = df['geometry_wkt'].apply(wkt.loads)
# Create GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
# Drop the original WKT column
gdf = gdf.drop(columns=['geometry_wkt'])
print("Converting data types...")
# recorded_at as timestamp[us]
gdf['recorded_at'] = pd.to_datetime(gdf['recorded_at']).dt.floor('us')
# unit_count as int32
gdf['unit_count'] = gdf['unit_count'].astype('int32')
# All other columns as string
# Identify non-geometry, non-timestamp, non-int columns
cols_to_string = []
for col in gdf.columns:
if col == 'geometry':
continue
if col == 'recorded_at':
continue
if col == 'unit_count':
continue
cols_to_string.append(col)
for col in cols_to_string:
gdf[col] = gdf[col].astype(str)
# Ensure column order and structure
# To be safe with GeoParquet, we'll write using pyarrow to be explicit about types if needed,
# but geopandas to_parquet usually handles it well.
print(f"Writing to {output_parquet}...")
gdf.to_parquet(output_parquet, index=False)
print("Done!")
if __name__ == "__main__":
convert_csv_to_geoparquet('nyc_addresses.csv', 'addresses.geoparquet')
Wrote solve.py (1667 chars).
0:15
Bash
python solve.py
Reading nyc_addresses.csv... Converting geometry... Converting data types... Writing to addresses.geoparquet... Done!
0:20
Bash
python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial; ...
100% ▕██████████████████████████████████████▏ (00:00:04.34 elapsed)
Traceback (most recent call last):
File "<string>", line 1, in <module>
import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial; LOAD spatial;'); print(con.execute("DESCRIBE SELECT * FROM st_read('addresses.geoparquet')").df())
~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_duckdb.IOException: IO Error: Could not open GDAL dataset at: addresses.geoparquet
LINE 1: DESCRIBE SELECT * FROM st_read('addresses.geoparquet')
^
0:29
Bash
python -c "import geopandas as gpd; gdf = gpd.read_parquet('addresses.geoparquet...
id str
country str
postcode str
street str
number str
unit str
postal_city str
recorded_at datetime64[us, UTC]
unit_count int32
geometry geometry
dtype: object
id ... geometry
0 0005eda0-2540-4e36-bc81-dfb52e323d99 ... POINT (-73.99223 40.71616)
1 00587e13-6253-485d-9258-e47d43af7559 ... POINT (-73.99833 40.71204)
2 0070f126-d36a-43b3-b767-a8fbb7156602 ... POINT (-74.0015 40.70733)
3 00a8fc38-5d9a-42da-94a9-7d2f4101f82c ... POINT (-73.99286 40.70268)
4 00f815af-51f6-44c2-b0d6-ee23092d14b2 ... POINT (-74.0176 40.71024)
[5 rows x 10 columns]
0:48
Write
solve.py
content · 40 lines · py
import pandas as pd
import geopandas as gpd
from shapely import wkt
def convert_csv_to_geoparquet(input_csv, output_parquet):
print(f"Reading {input_csv}...")
df = pd.read_csv(input_csv)
print("Converting geometry...")
df['geometry'] = df['geometry_wkt'].apply(wkt.loads)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
gdf = gdf.drop(columns=['geometry_wkt'])
print("Converting data types...")
# recorded_at as timestamp[us]
# pandas datetime64[us] corresponds to timestamp[us] in Arrow/Parquet
gdf['recorded_at'] = pd.to_datetime(gdf['recorded_at'], utc=True).dt.floor('us')
# unit_count as int32
gdf['unit_count'] = gdf['unit_count'].astype('int32')
# All other columns as string
# We must ensure that 'id', 'country', 'postcode', 'street', 'number', 'unit', 'postal_city' are strings.
# In the previous run, 'id' was already str, etc.
# Let's be explicit.
string_cols = ['id', 'country', 'postcode', 'street', 'number', 'unit', 'postal_city']
for col in string_cols:
if col in gdf.columns:
gdf[col] = gdf[col].astype(str)
# Replace "nan" strings if they were actually NaNs in the CSV
gdf[col] = gdf[col].replace('nan', '')
print(f"Writing to {output_parquet}...")
# Using engine='pyarrow' to ensure better Parquet compliance
gdf.to_parquet(output_parquet, index=False, engine='pyarrow')
print("Done!")
if __name__ == "__main__":
convert_csv_to_geoparquet('nyc_addresses.csv', 'addresses.geoparquet')
Wrote solve.py (1580 chars).
0:50
Bash
python solve.py
Reading nyc_addresses.csv... Converting geometry... Converting data types... Writing to addresses.geoparquet... Done!
0:55
Bash
python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial; ...
Traceback (most recent call last):
File "<string>", line 1, in <module>
import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial; LOAD spatial;'); print(con.execute("DESCRIBE SELECT * FROM 'addresses.geoparquet'").df())
~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_duckdb.BinderException: Binder Error: No extension found that is capable of reading the file "addresses.geoparquet"
* If this file is a supported file format you can explicitly use the reader functions, such as read_csv, read_json or read_parquet
0:57
Bash
python -c "import duckdb; con = duckdb.connect(); print(con.execute(\"DESCRIBE S...
Traceback (most recent call last):
File "<string>", line 1, in <module>
import duckdb; con = duckdb.connect(); print(con.execute("DESCRIBE SELECT * FROM 'addresses.geoparquet'").df())
~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_duckdb.BinderException: Binder Error: No extension found that is capable of reading the file "addresses.geoparquet"
* If this file is a supported file format you can explicitly use the reader functions, such as read_csv, read_json or read_parquet
0:59
Bash
python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial; ...
column_name column_type null key default extra
0 id VARCHAR YES None None None
1 country VARCHAR YES None None None
2 postcode VARCHAR YES None None None
3 street VARCHAR YES None None None
4 number VARCHAR YES None None None
5 unit VARCHAR YES None None None
6 postal_city VARCHAR YES None None None
7 recorded_at TIMESTAMP WITH TIME ZONE YES None None None
8 unit_count INTEGER YES None None None
9 geometry GEOMETRY('EPSG:4326') YES None None None
1:03
Bash
python -c "import duckdb; con = duckdb.connect(); con.execute('INSTALL spatial; ...
sum(unit_count) 0 1566.0
1:13