データのキューブ、つまりLAT、LONG、TIMEを3次元として持ち、Temperatureを格納する単純なnetCDFファイルがあります。NumPy に Masked Array の形式で格納されます。以下のコードは、CSV 形式で抽出します。しかし、20 MB のファイルを処理するには非常に時間がかかります。つまり、反復ごとに 20 秒かかるため、合計で 4 * 548 * 20 秒 = 43840 秒 = 703 分 = 12 時間になります。
コメント TAKES_LONG_TIME のある行を見ると、さらに時間がかかります。セルごとに、Python から NumPy の C コードへの切り替えが行われていると思います。以下のシナリオでどのように解決できるかわかりません。お知らせ下さい。ありがとう。
# conda install -y -c conda-forge iris
import iris
import cf_units as unit
import numpy as np
import datetime
import urllib.request
from os import path
def make_data_object_name(dataset_name, year, month, day, hour, realization, forecast_period):
template_string = "prods_op_{}_{:02d}{:02d}{:02d}_{:02d}_{:02d}_{:03d}.nc"
return template_string.format(dataset_name, year, month, day, hour, realization, forecast_period)
def download_data_object(dataset_name, data_object_name):
url = "https://s3.eu-west-2.amazonaws.com/" + dataset_name + "/" + data_object_name
urllib.request.urlretrieve(url, data_object_name) # save in this directory with same name
def load_data():
filename = 'prods_op_mogreps-uk_20140101_03_02_003.nc'
if not path.exists(filename):
# obj_name = make_data_object_name('mogreps-uk', 2014, 1, 1, 3, 2, 3)
download_data_object('mogreps-uk', filename)
listofcubes = iris.load(filename)
air_temps = listofcubes.extract('air_temperature')
surface_temp = air_temps[0]
dim_time, dim_lat, dim_long = "time", "grid_latitude", "grid_longitude"
time_cords = surface_temp.coord(dim_time).points
time_since = str(surface_temp.coord(dim_time).units)
lat_cords = surface_temp.coord(dim_lat).points
long_cords = surface_temp.coord(dim_long).points
time_records = [str(unit.num2date(time_cords[i], time_since, unit.CALENDAR_STANDARD)) for i in
range(len(time_cords))]
lat_records = [lat_cords[lat_recorded] for lat_recorded in range(len(lat_cords))]
long_records = [long_cords[long_recorded] for long_recorded in range(len(long_cords))]
print(len(time_records), len(lat_records), len(long_records))
print(surface_temp.shape)
data_size = len(surface_temp.shape)
print(" File write start --> ", datetime.datetime.now())
with open(filename + '.curated', 'w') as filehandle:
for t, time_record in enumerate(time_records): # Iterate TIME - 4
t_a = surface_temp[t] if data_size == 3 else surface_temp[t][0]
for lat, lat_record in enumerate(lat_records): # Iterate LAT - 548
lat_a = t_a[lat]
iter_start_time = datetime.datetime.now()
lat_lines = list()
for lng, long_record in enumerate(long_records): # Iterate Long 421
data = str(lat_a[lng].data.min()) # TAKES_LONG_TIME
lat_lines.append(time_record + ',' + str(lat_record) + ',' + str(long_record) + ',' + data + '\n')
filehandle.writelines(lat_lines)
print(t, time_record, lat, lat_record, " time taken in seconds -> ",
(datetime.datetime.now() - iter_start_time).seconds)
if __name__ == "__main__":
load_data()