Recently, I had to frequently share deep learning model outputs with a client. However, with each dataset being several GB in size, this increasingly became a challenge. By default, many outputs, such as binary segmentation results, are stored as floating point. Often, such data can be converted to an integer data type (e.g., uint8) without considerable loss of information. In my case, this reduced the data size to a few MB. Because I needed this operation so frequently, I am archiving the relevant script in this post.
from osgeo import gdal
import os
import numpy as np
def write_raster(path, array, reference_dataset, data_type):
    """Write an array to a DEFLATE-compressed GeoTIFF.

    The geotransform and projection are copied from ``reference_dataset`` so
    the output carries the same georeferencing as the reference raster.

    Args:
        path: Destination path of the GeoTIFF to create.
        array: 2-D array ``(rows, cols)`` written to a single band, or a 3-D
            array ``(bands, rows, cols)`` written band by band.
        reference_dataset: Open GDAL dataset that supplies the geotransform
            and projection.
        data_type: GDAL data type constant of the output (e.g.
            ``gdal.GDT_Byte``).

    Raises:
        ValueError: If ``array`` is neither 2-D nor 3-D.
        RuntimeError: If GDAL cannot create the output dataset.
    """
    dimensions = len(array.shape)
    if dimensions == 2:
        band_count = 1
        rows, cols = array.shape
    elif dimensions == 3:
        # Multi-band rasters are laid out as (bands, rows, cols).
        band_count, rows, cols = array.shape
    else:
        raise ValueError(
            "array must be 2-D or 3-D, got {} dimension(s)".format(dimensions)
        )

    driver = gdal.GetDriverByName("GTiff")
    out_dataset = driver.Create(
        path, cols, rows, band_count, data_type, ['COMPRESS=DEFLATE']
    )
    # Without gdal.UseExceptions(), Create() signals failure by returning None.
    if out_dataset is None:
        raise RuntimeError("Could not create raster: {}".format(path))

    out_dataset.SetGeoTransform(reference_dataset.GetGeoTransform())
    out_dataset.SetProjection(reference_dataset.GetProjection())

    # GDAL band indices are 1-based.
    for band_index in range(1, band_count + 1):
        band_data = array if dimensions == 2 else array[band_index - 1]
        out_dataset.GetRasterBand(band_index).WriteArray(band_data)

    out_dataset.FlushCache()
    # Dropping the last reference lets GDAL close the file and finish writing.
    out_dataset = None
# Directory holding the original rasters, and the directory that receives the
# thresholded uint8 copies.
source_dir = "path/to/original/tif/files/"
target_dir = "path/to/converted/tif/files/"

# exist_ok avoids the race between a separate exists() check and makedirs().
os.makedirs(target_dir, exist_ok=True)

files = os.listdir(source_dir)
# Match the ".tif" extension including the dot, so names that merely end in
# the letters "tif" are not picked up and later mangled.
files_tif = [file for file in files if file.endswith(".tif")]

for file_tif in files_tif:
    # os.path.join works whether or not the directories end with a separator.
    file_path = os.path.join(source_dir, file_tif)
    file_path_binary = os.path.join(target_dir, file_tif)

    gdal_ds = gdal.Open(file_path)
    # Without gdal.UseExceptions(), Open() signals failure by returning None.
    if gdal_ds is None:
        raise RuntimeError("Could not open raster: {}".format(file_path))

    gdal_arr = gdal_ds.ReadAsArray()
    # Threshold at 0.5: values above become 1, everything else 0, stored as uint8.
    gdal_arr_binary = (gdal_arr > 0.5).astype(np.uint8)

    # Strip the extension properly instead of assuming it is four characters.
    out_file_path = os.path.splitext(file_path_binary)[0] + "_binary.tif"
    write_raster(out_file_path, gdal_arr_binary, gdal_ds, gdal.GDT_Byte)

    # Release the source dataset handle before moving on to the next file.
    gdal_ds = None