Skip to main content

Upload, download and use files in YTsaurus operations

This notebook contains examples of how to store large binary data on YTsaurus as files.

This notebook demonstrates how to:

  1. Upload file.
  2. Read file.
  3. Use file in operation.

Files can be useful in the following cases:

  1. Operations require using dictionaries (e.g., for working with geodata, currency rates, etc)
  2. For saving checkpoints or a trained model.
  3. You need to store data in a reliable storage.
  4. We store big Jupyter notebooks as files:)
from yt import wrapper as yt 
import uuid
import json

Create a base directory for examples

# Pick a unique scratch directory for this run: under the user's home path when
# the home_path attribute exists, otherwise under //tmp/examples.
username = yt.get_user_name()
home_path_attr = f"//sys/users/{username}/@user_info/home_path"
if yt.exists(home_path_attr):
    home = yt.get(home_path_attr)
    working_dir = f"{home}/{uuid.uuid4().hex}"
else:
    working_dir = f"//tmp/examples/{uuid.uuid4().hex}"
# recursive=True also creates missing parent nodes, so the fallback works even
# when //tmp/examples has not been created yet.
yt.create("map_node", working_dir, recursive=True)
print(working_dir)

Upload and download file as serialized json

# Sample currency-rate table used as the file payload.
# NOTE(review): values look like rates relative to USD (USD is 1) - confirm the source.
rates = {
    "USD": 1, "AED": 3.6725, "AFN": 70.930145, "ALL": 89.880745, "AMD": 388.084007,
    "ANG": 1.79, "AOA": 910.532308, "ARS": 948.17, "AUD": 1.474829, "AWG": 1.79,
    "AZN": 1.699237, "BAM": 1.749036, "BBD": 2, "BDT": 119.497266, "BGN": 1.749165,
    "BHD": 0.376, "BIF": 2900.267644, "BMD": 1, "BND": 1.302522, "BOB": 6.918444,
    "BRL": 5.563555, "BSD": 1, "BTN": 83.858202, "BWP": 13.368056, "BYN": 3.25702,
    "BZD": 2, "CAD": 1.351507, "CDF": 2821.533873, "CHF": 0.847992, "CLP": 914.946844,
    "CNY": 7.125721, "COP": 4032.124348, "CRC": 523.251219, "CUP": 24, "CVE": 98.606453,
    "CZK": 22.409394, "DJF": 177.721, "DKK": 6.669549, "DOP": 59.725937, "DZD": 134.240444,
    "EGP": 48.773605, "ERN": 15, "ETB": 110.793381, "EUR": 0.894271, "FJD": 2.209201,
    "FKP": 0.758066, "FOK": 6.669253, "GBP": 0.75807, "GEL": 2.698374, "GGP": 0.758066,
    "GHS": 15.646015, "GIP": 0.758066, "GMD": 70.090529, "GNF": 8687.65698, "GTQ": 7.739001,
    "GYD": 209.238751, "HKD": 7.796834, "HNL": 24.795486, "HRK": 6.737862, "HTG": 131.727086,
    "HUF": 351.816224, "IDR": 15463.537936, "ILS": 3.689802, "IMP": 0.758066, "INR": 83.858211,
    "IQD": 1307.71534, "IRR": 42046.557011, "ISK": 136.842504, "JEP": 0.758066, "JMD": 156.609396,
    "JOD": 0.709, "JPY": 144.238461, "KES": 128.860802, "KGS": 85.514332, "KHR": 4063.922959,
    "KID": 1.474819, "KMF": 439.950981, "KRW": 1326.065473, "KWD": 0.305137, "KYD": 0.833333,
    "KZT": 483.412984, "LAK": 21958.776695, "LBP": 89500, "LKR": 299.54412, "LRD": 195.128633,
    "LSL": 17.727478, "LYD": 4.761189, "MAD": 9.651574, "MDL": 17.440239, "MGA": 4559.019151,
    "MKD": 55.302516, "MMK": 2099.920135, "MNT": 3394.316353, "MOP": 8.031133, "MRU": 39.757746,
    "MUR": 46.041061, "MVR": 15.425412, "MWK": 1734.083284, "MXN": 19.262175, "MYR": 4.37505,
    "MZN": 63.668821, "NAD": 17.727478, "NGN": 1572.880665, "NIO": 36.923597, "NOK": 10.483587,
    "NPR": 134.173123, "NZD": 1.607849, "OMR": 0.384497, "PAB": 1, "PEN": 3.743768,
    "PGK": 3.892274, "PHP": 56.230734, "PKR": 278.734522, "PLN": 3.824356, "PYG": 7598.969319,
    "QAR": 3.64, "RON": 4.471734, "RSD": 105.116406, "RUB": 91.507333, "RWF": 1331.048915,
    "SAR": 3.75, "SBD": 8.500735, "SCR": 13.837586, "SDG": 458.303626, "SEK": 10.200087,
    "SGD": 1.302527, "SHP": 0.758066, "SLE": 22.414795, "SLL": 22414.795049, "SOS": 571.574926,
    "SRD": 29.173375, "SSP": 2809.067693, "STN": 21.909564, "SYP": 13122.06765, "SZL": 17.727478,
    "THB": 34.138685, "TJS": 10.605013, "TMT": 3.499058, "TND": 3.045497, "TOP": 2.324466,
    "TRY": 34.030627, "TTD": 6.784157, "TVD": 1.474819, "TWD": 31.688138, "TZS": 2699.373687,
    "UAH": 41.241903, "UGX": 3715.735216, "UYU": 40.352563, "UZS": 12685.942826, "VES": 36.5888,
    "VND": 24991.37597, "VUV": 117.995378, "WST": 2.694395, "XAF": 586.601308, "XCD": 2.7,
    "XDR": 0.742859, "XOF": 586.601308, "XPF": 106.714781, "YER": 250.215458, "ZAR": 17.727644,
    "ZMW": 26.177908, "ZWL": 13.8134,
}
# Serialize to UTF-8 encoded JSON bytes: this is the binary payload written to the file.
json_rates = json.dumps(rates).encode("utf-8")

`write_file` can write arbitrary binary data

# Round-trip the serialized rates through a file node under the working directory.
file_path = f"{working_dir}/file"
yt.write_file(file_path, json_rates)
stream = yt.read_file(file_path)
content = stream.read()
# Last expression of the cell: the keys parsed back from the downloaded content.
json.loads(content).keys()

Use files in operations

def mapper(row):
    """Check from inside a job that the attached file is available locally.

    Args:
        row: The input row passed to the mapper; it is not used.

    Returns:
        None. The mapper emits no output rows.

    Raises:
        AssertionError: If the local file does not contain "USD".
    """
    # The local file has the same name as the Cypress node it was downloaded from.
    with open("file", encoding="utf-8") as rates_file:
        assert "USD" in rates_file.read()
# Input and output table paths for the demo operation.
src = f"{working_dir}/fake_src"
dst = f"{working_dir}/fake_dst"

# A single-row input table is written so the operation has something to process.
input_rows = [{"x": 1}]
yt.write_table(src, input_rows)

The `yt_files` parameter makes the operation download the file from Cypress and save it on the job's file system. The local file has the same name as the Cypress node.

# Run the map operation with the uploaded file passed through yt_files.
yt.run_map(mapper, src, dst, yt_files=[file_path])