# Save larger files from bucket to bucket using forces minIO
<div class="alert alert-danger alert-info">
    <b>It is important to save your results in a place that can last longer than a few days/weeks!</b>
</div>
- When you have saved data locally on your JupyterLab instance and you want to mak a backup on https://forces2021.uiogeo-apps.sigma2.no/

In [2]:
import os
import pathlib
import s3fs
import xarray as xr

## Connect to bucket (anonymous login for public data only)

In [3]:
fs = s3fs.S3FileSystem(anon=True,
      client_kwargs={
         'endpoint_url': 'https://climate.uiogeo-apps.sigma2.no/'
      })

## Get data into xarray

In [26]:
s3path = 's3://ESGF/CMIP6/GeoMIP/MPI-M/*/G6sulfur/*/day/tasmin/gn/*/*.nc'

In [27]:
remote_files = fs.glob(s3path)

In [28]:
remote_files

['ESGF/CMIP6/GeoMIP/MPI-M/MPI-ESM1-2-LR/G6sulfur/r1i1p1f1/day/tasmin/gn/v20190710/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn_20150101-20341231.nc',
 'ESGF/CMIP6/GeoMIP/MPI-M/MPI-ESM1-2-LR/G6sulfur/r1i1p1f1/day/tasmin/gn/v20190710/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn_20350101-20541231.nc',
 'ESGF/CMIP6/GeoMIP/MPI-M/MPI-ESM1-2-LR/G6sulfur/r1i1p1f1/day/tasmin/gn/v20190710/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn_20550101-20741231.nc',
 'ESGF/CMIP6/GeoMIP/MPI-M/MPI-ESM1-2-LR/G6sulfur/r1i1p1f1/day/tasmin/gn/v20190710/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn_20750101-20941231.nc',
 'ESGF/CMIP6/GeoMIP/MPI-M/MPI-ESM1-2-LR/G6sulfur/r1i1p1f1/day/tasmin/gn/v20190710/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn_20950101-20991231.nc',
 'ESGF/CMIP6/GeoMIP/MPI-M/MPI-ESM1-2-LR/G6sulfur/r2i1p1f1/day/tasmin/gn/v20190710/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r2i1p1f1_gn_20150101-20341231.nc',
 'ESGF/CMIP6/GeoMIP/MPI-M/MPI-ESM1-2-LR/G6sulfur/r2i1p1f1/day/tasmin/gn/v20190710/tasmin_day_M

In [29]:
# Iterate through remote_files to create a fileset
fileset = [fs.open(file) for file in remote_files]

# This works
dset = xr.open_mfdataset(fileset, combine='by_coords', use_cftime=True)

In [30]:
dset

Unnamed: 0,Array,Chunk
Bytes,502.58 kB,116.88 kB
Shape,"(31411, 2)","(7305, 2)"
Count,15 Tasks,5 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 502.58 kB 116.88 kB Shape (31411, 2) (7305, 2) Count 15 Tasks 5 Chunks Type object numpy.ndarray",2  31411,

Unnamed: 0,Array,Chunk
Bytes,502.58 kB,116.88 kB
Shape,"(31411, 2)","(7305, 2)"
Count,15 Tasks,5 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,48.25 MB,11.22 MB
Shape,"(31411, 96, 2)","(7305, 96, 2)"
Count,20 Tasks,5 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 48.25 MB 11.22 MB Shape (31411, 96, 2) (7305, 96, 2) Count 20 Tasks 5 Chunks Type float64 numpy.ndarray",2  96  31411,

Unnamed: 0,Array,Chunk
Bytes,48.25 MB,11.22 MB
Shape,"(31411, 96, 2)","(7305, 96, 2)"
Count,20 Tasks,5 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,96.49 MB,22.44 MB
Shape,"(31411, 192, 2)","(7305, 192, 2)"
Count,20 Tasks,5 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 96.49 MB 22.44 MB Shape (31411, 192, 2) (7305, 192, 2) Count 20 Tasks 5 Chunks Type float64 numpy.ndarray",2  192  31411,

Unnamed: 0,Array,Chunk
Bytes,96.49 MB,22.44 MB
Shape,"(31411, 192, 2)","(7305, 192, 2)"
Count,20 Tasks,5 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.32 GB,538.58 MB
Shape,"(31411, 96, 192)","(7305, 96, 192)"
Count,15 Tasks,5 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.32 GB 538.58 MB Shape (31411, 96, 192) (7305, 96, 192) Count 15 Tasks 5 Chunks Type float32 numpy.ndarray",192  96  31411,

Unnamed: 0,Array,Chunk
Bytes,2.32 GB,538.58 MB
Shape,"(31411, 96, 192)","(7305, 96, 192)"
Count,15 Tasks,5 Chunks
Type,float32,numpy.ndarray


## Check the size (MB) of our dataset

In [32]:
dset.nbytes / 1e6

2461.368272

Our dataset is bit more than 2.4 GB

## Save file from memory to bucket

In [35]:
%%time
dset.load()

CPU times: user 182 µs, sys: 27 µs, total: 209 µs
Wall time: 218 µs


## Save your results to Remote private object storage
- your credentials are in `$HOME/.aws/credentials` 
- check with your instructor to get the secret access key (replace XXX by the right key)

```
[default]
aws_access_key_id=forces2021-work
aws_secret_access_key=XXXXXXXXXXXX
aws_endpoint_url=https://forces2021.uiogeo-apps.sigma2.no/
```

In [38]:
target = s3fs.S3FileSystem(anon=False,
      client_kwargs={
         'endpoint_url': 'https://forces2021.uiogeo-apps.sigma2.no/'
      })

## Save as netCDF
- netCDF is not a cloud-optimized format so it may be slow

In [39]:
s3_path =  "s3://work/annefou/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn.nc"
print(s3_path)

s3://work/annefou/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn.nc


In [40]:
with target.open(s3_path, 'wb') as f:
    f.write(dset.to_netcdf(None))



## Then you can use the remote file

In [55]:
remote_file = ['work/annefou/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn.nc']

In [56]:
fileset = [target.open(file) for file in remote_file]

In [57]:
%%time
ds_check = xr.open_mfdataset(fileset, combine='by_coords', use_cftime=True)
ds_check

CPU times: user 20.8 s, sys: 4.15 s, total: 24.9 s
Wall time: 51.1 s


Unnamed: 0,Array,Chunk
Bytes,96.49 MB,96.49 MB
Shape,"(31411, 192, 2)","(31411, 192, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 96.49 MB 96.49 MB Shape (31411, 192, 2) (31411, 192, 2) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2  192  31411,

Unnamed: 0,Array,Chunk
Bytes,96.49 MB,96.49 MB
Shape,"(31411, 192, 2)","(31411, 192, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.32 GB,2.32 GB
Shape,"(31411, 96, 192)","(31411, 96, 192)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.32 GB 2.32 GB Shape (31411, 96, 192) (31411, 96, 192) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",192  96  31411,

Unnamed: 0,Array,Chunk
Bytes,2.32 GB,2.32 GB
Shape,"(31411, 96, 192)","(31411, 96, 192)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,48.25 MB,48.25 MB
Shape,"(31411, 96, 2)","(31411, 96, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 48.25 MB 48.25 MB Shape (31411, 96, 2) (31411, 96, 2) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2  96  31411,

Unnamed: 0,Array,Chunk
Bytes,48.25 MB,48.25 MB
Shape,"(31411, 96, 2)","(31411, 96, 2)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,502.58 kB,502.58 kB
Shape,"(31411, 2)","(31411, 2)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 502.58 kB 502.58 kB Shape (31411, 2) (31411, 2) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2  31411,

Unnamed: 0,Array,Chunk
Bytes,502.58 kB,502.58 kB
Shape,"(31411, 2)","(31411, 2)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray


In [58]:
%%time
ds_seas = ds_check.groupby('time.season').mean('time', keep_attrs=True, skipna = True)

CPU times: user 227 ms, sys: 2.54 ms, total: 229 ms
Wall time: 246 ms


## Save as Zarr
- it usually takes longer to save but it is much faster to read

In [51]:
dset.load()

In [49]:
s3_path =  "s3://work/annefou/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn.zarr"
print(s3_path)

s3://work/annefou/tasmin_day_MPI-ESM1-2-LR_G6sulfur_r1i1p1f1_gn.zarr


In [52]:
store = s3fs.S3Map(root=s3_path, s3=target, check=False)

In [53]:
%%time
dset.to_zarr(store=store, mode="w", consolidated=True, compute=True)

CPU times: user 35.2 s, sys: 6.62 s, total: 41.8 s
Wall time: 1min


<xarray.backends.zarr.ZarrStore at 0x7f651e2a3040>

## Then you can use the remote file
- loading Zarr is usually faster, especially with large datasets

In [54]:
%%time
ds_check = xr.open_zarr(store=store, consolidated=True)
ds_check

CPU times: user 90.7 ms, sys: 5.84 ms, total: 96.5 ms
Wall time: 697 ms


Unnamed: 0,Array,Chunk
Bytes,48.25 MB,753.98 kB
Shape,"(31411, 96, 2)","(3927, 24, 1)"
Count,65 Tasks,64 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 48.25 MB 753.98 kB Shape (31411, 96, 2) (3927, 24, 1) Count 65 Tasks 64 Chunks Type float64 numpy.ndarray",2  96  31411,

Unnamed: 0,Array,Chunk
Bytes,48.25 MB,753.98 kB
Shape,"(31411, 96, 2)","(3927, 24, 1)"
Count,65 Tasks,64 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,96.49 MB,1.51 MB
Shape,"(31411, 192, 2)","(3927, 48, 1)"
Count,65 Tasks,64 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 96.49 MB 1.51 MB Shape (31411, 192, 2) (3927, 48, 1) Count 65 Tasks 64 Chunks Type float64 numpy.ndarray",2  192  31411,

Unnamed: 0,Array,Chunk
Bytes,96.49 MB,1.51 MB
Shape,"(31411, 192, 2)","(3927, 48, 1)"
Count,65 Tasks,64 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.32 GB,2.26 MB
Shape,"(31411, 96, 192)","(1964, 12, 24)"
Count,1025 Tasks,1024 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.32 GB 2.26 MB Shape (31411, 96, 192) (1964, 12, 24) Count 1025 Tasks 1024 Chunks Type float32 numpy.ndarray",192  96  31411,

Unnamed: 0,Array,Chunk
Bytes,2.32 GB,2.26 MB
Shape,"(31411, 96, 192)","(1964, 12, 24)"
Count,1025 Tasks,1024 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,502.58 kB,251.30 kB
Shape,"(31411, 2)","(15706, 2)"
Count,3 Tasks,2 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 502.58 kB 251.30 kB Shape (31411, 2) (15706, 2) Count 3 Tasks 2 Chunks Type datetime64[ns] numpy.ndarray",2  31411,

Unnamed: 0,Array,Chunk
Bytes,502.58 kB,251.30 kB
Shape,"(31411, 2)","(15706, 2)"
Count,3 Tasks,2 Chunks
Type,datetime64[ns],numpy.ndarray


In [59]:
%%time
ds_seas = ds_check.groupby('time.season').mean('time', keep_attrs=True, skipna = True)

CPU times: user 186 ms, sys: 2.35 ms, total: 188 ms
Wall time: 201 ms
