Skip to main content

upload-squirrel-hectare-data

Data from data.cityofnewyork.us

import requests 
import uuid
# Direct CSV export link for the dataset hosted on data.cityofnewyork.us.
SQUIRREL_HECTARE_CSV_URL = "https://data.cityofnewyork.us/api/views/ej9h-v6g2/rows.csv?fourfour=ej9h-v6g2&cacheBust=1681846116&date=20240905&accessType=DOWNLOAD"

# requests applies no timeout by default; bound the call so a stalled
# connection cannot hang the script forever.
response = requests.get(SQUIRREL_HECTARE_CSV_URL, timeout=60)
print(response.status_code)
# Fail loudly on an HTTP error instead of handing an error page to the
# CSV parsing step that reads response.text.
response.raise_for_status()
from yt import wrapper as yt
from yt import type_info
# Pick a unique scratch directory for this run: under the user's home path
# when the `home_path` attribute exists, otherwise under //tmp/examples.
username = yt.get_user_name()
home_path_attribute = f"//sys/users/{username}/@user_info/home_path"
if yt.exists(home_path_attribute):
    home = yt.get(home_path_attribute)
    working_dir = f"{home}/{uuid.uuid4().hex}"
else:
    working_dir = f"//tmp/examples/{uuid.uuid4().hex}"

# recursive=True creates any missing parent nodes as well, so the fallback
# path still works when //tmp/examples has not been created yet.
yt.create("map_node", working_dir, recursive=True)
print(working_dir)
import io
import csv
def string_to_int(value):
    """Convert a CSV cell to an integer, passing missing values through.

    Args:
        value: The cell content as a string, or None for a missing cell.

    Returns:
        None when ``value`` is None, otherwise ``int(value)``.

    Raises:
        ValueError: If ``value`` is a string that ``int`` cannot parse.
    """
    if value is None:
        return None
    return int(value)
# Parse the downloaded CSV text into a list of dicts keyed by header name.
content = io.StringIO(response.text)
csv_reader = csv.DictReader(content)
data = list(csv_reader)

# Columns whose values are converted from strings to integers.
integer_columns = ["Number of sighters", "Total Time of Sighting", "Number of Squirrels"]

for record in data:
    # Replace empty cells with None. Only existing keys are reassigned, so
    # mutating the dict while iterating over its items is safe.
    for key, value in record.items():
        if value == "":
            record[key] = None
    # Runs after the blank-cell pass so string_to_int sees None, not "".
    for key in integer_columns:
        record[key] = string_to_int(record[key])

# Preview the first few normalised records.
for line in data[:10]:
    print(line)
schema = yt.schema.TableSchema()

# Ordered (column name, column type) pairs for the target table. The order
# matters: columns are added to the schema in exactly this sequence.
column_specs = (
    ("hectare", type_info.String),
    ("shift", type_info.String),
    ("date", type_info.String),
    ("sighter", type_info.String),
    ("sighter_observed_weather_data", type_info.Optional[type_info.String]),
    ("litter", type_info.Optional[type_info.String]),
    ("litter_notes", type_info.Optional[type_info.String]),
    ("other_animals_sightings", type_info.Optional[type_info.String]),
    ("hectare_conditions", type_info.Optional[type_info.String]),
    ("hectare_conditions_notes", type_info.Optional[type_info.String]),
    ("number_of_sighters", type_info.Uint16),
    ("number_of_squirrels", type_info.Uint16),
    ("total_time_of_sighting", type_info.Optional[type_info.Uint16]),
)
for column_name, column_type in column_specs:
    schema.add_column(column_name, column_type)
# CSV header names, listed in the same order as the schema columns so the
# two lists can be paired positionally.
csv_columns = ['Hectare', 'Shift', 'Date', 'Anonymized Sighter', 'Sighter Observed Weather Data', 'Litter', 'Litter Notes', 'Other Animal Sightings', 'Hectare Conditions', 'Hectare Conditions Notes', 'Number of sighters', 'Number of Squirrels', 'Total Time of Sighting']
yt_columns = [col.name for col in schema.columns]

# zip() silently truncates to the shorter list, so check the lengths first.
assert len(csv_columns) == len(yt_columns)

# Pair the names once instead of re-zipping for every record.
column_pairs = list(zip(csv_columns, yt_columns))

# Re-key every parsed record from its CSV header names to the schema names.
yt_data = [
    {yt_key: record[csv_key] for csv_key, yt_key in column_pairs}
    for record in data
]

# Preview the first few re-keyed records.
for record in yt_data[:10]:
    print(record)
# The destination table lives inside the scratch directory chosen earlier.
table_path = f"{working_dir}/squirrels-hectare-data"
print(table_path)

# Create the table with the schema attached, then upload the prepared rows.
table_attributes = {"schema": schema.to_yson_type()}
yt.create("table", table_path, force=True, attributes=table_attributes)
yt.write_table(table_path, yt_data)

# Print the path again so it is the last thing shown after the upload.
print(table_path)