# Source code for sertit.s3

# Copyright 2026, SERTIT-ICube - France, https://sertit.unistra.fr/
# This file is part of sertit-utils project
#     https://github.com/sertit/sertit-utils
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
S3 tools
"""

import logging
import os
from contextlib import contextmanager
from functools import wraps

from cloudpathlib import S3Client

from sertit.logs import SU_NAME

# Module-level logger, obtained under the package-wide SU_NAME logger name.
LOGGER = logging.getLogger(SU_NAME)

# The constants below hold the *names* of environment variables, not their values.
# NOTE: in_house_s3_configs() spells the same variable names as plain string
# literals instead of referencing these constants, so both must be kept in sync.
AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID"
"""
Environment variable linked to AWS Access Key ID.
"""

AWS_SECRET_ACCESS_KEY = "AWS_SECRET_ACCESS_KEY"
"""
Environment variable linked to AWS Secret Access Key.
"""

AWS_S3_ENDPOINT = "AWS_S3_ENDPOINT"
"""
Environment variable linked to AWS endpoint.
"""

# Default name of the on/off switch read by the s3_env decorator: its wrapper
# parses the variable with int(), treats an unset variable as 1 (S3 enabled)
# and writes "1" or "0" back into os.environ.
USE_S3_STORAGE = "USE_S3_STORAGE"
"""
Environment variable created to use Unistra's S3 bucket.
"""


def in_house_s3_configs():
    """
    Declare the in-house S3 configurations for rasterio, pyogrio and cloudpathlib.

    Every library gets its own list of descriptor dictionaries. Editing the
    lists below changes how rasterio, pyogrio and cloudpathlib connect to the
    S3 server. The descriptors are not usable as-is: they are translated into
    each library's own configuration system by the function args_dict, whose
    docstring explains the meaning of the descriptor keys.

    Returns:
        dict: The descriptor lists, stored under the keys "cloudpathlib",
        "pyogrio" and "rasterio".
    """
    import rasterio

    # Values of an already active rasterio environment are reused as defaults
    active_env = rasterio.env.getenv() if rasterio.env.hasenv() else {}

    # --- rasterio ---
    rasterio_descriptors = [
        {"kwargs": "profile_name"},
        {"name": "AWS_S3_ENDPOINT", "kwargs": "endpoint", "env": "AWS_S3_ENDPOINT"},
        {"name": "CPL_AWS_CREDENTIALS_FILE", "env": "AWS_SHARED_CREDENTIALS_FILE"},
        {"name": "AWS_CONFIG_FILE", "env": "AWS_CONFIG_FILE"},
    ]
    rasterio_descriptors += [
        {"name": option, "default": active_env.get(option, False)}
        for option in ("CPL_CURL_VERBOSE", "GDAL_DISABLE_READDIR_ON_OPEN")
    ]
    rasterio_descriptors += [
        {"name": "AWS_NO_SIGN_REQUEST", "kwargs": "no_sign_request"},
        {"name": "AWS_REQUEST_PAYER", "kwargs": "requester_pays"},
    ]

    # --- pyogrio ---
    pyogrio_descriptors = [
        {"name": "AWS_PROFILE", "kwargs": "profile_name", "env": "AWS_PROFILE"},
        {"name": "AWS_S3_ENDPOINT", "kwargs": "endpoint", "env": "AWS_S3_ENDPOINT"},
        {"name": "CPL_AWS_CREDENTIALS_FILE", "env": "AWS_SHARED_CREDENTIALS_FILE"},
        {"name": "AWS_CONFIG_FILE", "env": "AWS_CONFIG_FILE"},
        {"name": "AWS_NO_SIGN_REQUEST", "kwargs": "no_sign_request"},
        {"name": "AWS_REQUEST_PAYER", "kwargs": "requester_pays"},
    ]

    # --- cloudpathlib (S3Client keyword arguments) ---
    cloudpathlib_descriptors = [
        {
            "name": "endpoint_url",
            "kwargs": "endpoint",
            "env": "AWS_S3_ENDPOINT",
            "prefix": "https://",
        },
        {"kwargs": "aws_access_key_id", "env": "AWS_ACCESS_KEY_ID"},
        {"kwargs": "aws_secret_access_key", "env": "AWS_SECRET_ACCESS_KEY"},
        {"kwargs": "requester_pays"},
        {"kwargs": "no_sign_request", "default": False},
        {"kwargs": "profile_name", "env": "AWS_S3_PROFILE"},
    ]
    # Keyword arguments forwarded under their own name, with no env/default
    cloudpathlib_descriptors.extend(
        {"kwargs": passthrough}
        for passthrough in (
            "aws_session_token",
            "botocore_session",
            "boto3_session",
            "file_cache_mode",
            "local_cache_dir",
            "boto3_transfer_config",
            "content_type_method",
            "extra_args",
        )
    )

    return {
        "cloudpathlib": cloudpathlib_descriptors,
        "pyogrio": pyogrio_descriptors,
        "rasterio": rasterio_descriptors,
    }
def s3_env(*args, **kwargs):
    """
    Decorator factory running a function inside an S3 compatible storage environment.

    The endpoint url has to be given when an S3 compatible storage is used,
    since GDAL/Rasterio does not read the endpoint url from the config file.

    The S3 configuration is searched in several places. Order of precedence,
    from least to greatest (the last one overrides the others):

    1. AWS profile
    2. Given arguments
    3. AWS environment variable

    The rasterio and pyogrio options are resolved once, when the decorator is
    created. On every call of the decorated function, the environment variable
    named by the ``use_s3_env_var`` keyword (``USE_S3_STORAGE`` by default) is
    read: if it is unset or a non-zero integer, the S3 client is defined and
    the function runs inside the S3 environment; otherwise the function runs
    as-is. In both cases the variable is written back as "1" or "0".

    Returns:
        Callable: decorated function

    Example:
        >>> from sertit.s3 import s3_env
        >>> from sertit import AnyPath
        >>> @s3_env(endpoint="s3.unistra.fr")
        >>> def file_exists(path: str):
        >>>     pth = AnyPath(path)
        >>>     print(pth.exists())
        >>> file_exists("s3://sertit-geodatastore/GLOBAL/COPDEM_30m/COPDEM_30m.vrt")
        True
    """
    import rasterio
    from pyogrio import set_gdal_config_options

    switch_name = kwargs.get("use_s3_env_var", USE_S3_STORAGE)

    configs = s3_args(*args, **kwargs)
    rasterio_options = configs["rasterio"]
    pyogrio_options = configs["pyogrio"]

    def decorator(function):
        @wraps(function)
        def s3_env_wrapper(*_args, **_kwargs):
            """S3 environment wrapper"""
            # S3 explicitly switched off: run the function on disk files
            if not int(os.getenv(switch_name, 1)):
                os.environ[switch_name] = "0"
                LOGGER.info("Using on disk files")
                return function(*_args, **_kwargs)

            # Define S3 client for S3 paths
            define_s3_client(**kwargs)
            os.environ[switch_name] = "1"
            LOGGER.info("Using S3 files")

            set_gdal_config_options(pyogrio_options)
            with rasterio.Env(**rasterio_options):
                return function(*_args, **_kwargs)

        return s3_env_wrapper

    return decorator
@contextmanager
def temp_s3(
    endpoint: str = None,
    profile_name: str = None,
    requester_pays: bool = False,
    no_sign_request: bool = False,
    **kwargs,
) -> None:
    """
    Context manager setting up a temporary S3 environment.

    The endpoint url has to be given when an S3 compatible storage is used,
    since GDAL/Rasterio does not read the endpoint url from the config file.

    The S3 configuration is searched in several places. Order of precedence,
    from least to greatest (the last one overrides the others):

    1. AWS profile
    2. Given arguments
    3. AWS environment variable

    On exit, a fresh ``S3Client()`` is set as the default cloudpathlib client.

    NOTE(review): the options pushed through pyogrio's
    ``set_gdal_config_options`` are not reverted by this function on exit;
    confirm whether that is intended for a "temporary" environment.

    Args:
        endpoint: Endpoint to s3 path in the form s3.yourdomain.com
        profile_name: The name of your AWS profile
        requester_pays (bool): True if the endpoint says 'requester pays'
        no_sign_request (bool): True if the endpoint is open access

    Example:
        >>> from sertit.s3 import temp_s3
        >>> from sertit import AnyPath
        >>> def file_exists(path: str):
        >>>     with temp_s3(endpoint="s3.unistra.fr"):
        >>>         pth = AnyPath(path)
        >>>         print(pth.exists())
        >>> file_exists("s3://sertit-geodatastore/GLOBAL/COPDEM_30m/COPDEM_30m.vrt")
        True
    """
    import rasterio
    from pyogrio import set_gdal_config_options

    # Named arguments are gathered once and reused for both calls below
    named_options = {
        "endpoint": endpoint,
        "profile_name": profile_name,
        "requester_pays": requester_pays,
        "no_sign_request": no_sign_request,
    }

    configs = s3_args(**{**kwargs, **named_options})
    rasterio_options = configs["rasterio"]
    pyogrio_options = configs["pyogrio"]

    try:
        set_gdal_config_options(pyogrio_options)
        with rasterio.Env(**rasterio_options):
            # Define S3 client for S3 paths, then hand control to the caller
            yield define_s3_client(**named_options, **kwargs)
    finally:
        # Clean env: go back to a default-constructed S3 client
        S3Client().set_as_default_client()
def define_s3_client(
    endpoint=None,
    profile_name=None,
    requester_pays: bool = False,
    no_sign_request: bool = False,
    **kwargs,
):
    """
    Build a cloudpathlib S3 client and set it as the default client.

    The S3 configuration is searched in several places. Order of precedence,
    from least to greatest (the last one overrides the others):

    1. AWS profile
    2. Given arguments
    3. AWS environment variable

    Args:
        endpoint: The s3 endpoint (s3.yourdomain.com)
        profile_name: The name of the aws profile. Default to default profile in AWS configuration file.
        requester_pays (bool): True if the endpoint says 'requester pays'
        no_sign_request (bool): True if the endpoint is open access
        **kwargs: Other options, resolved through s3_args
    """
    # Named arguments are merged on top of a copy of the extra keyword arguments
    merged_kwargs = {
        **kwargs,
        "endpoint": endpoint,
        "profile_name": profile_name,
        "requester_pays": requester_pays,
        "no_sign_request": no_sign_request,
    }

    client_options = s3_args(**merged_kwargs)["cloudpathlib"]
    S3Client(**client_options).set_as_default_client()
def s3_args(*args, **kwargs) -> dict:
    """
    Return ready to use configurations for rasterio, pyogrio and cloudpathlib.

    The in-house descriptor lists given by in_house_s3_configs are each
    converted into a plain key/value dictionary by the function args_dict.

    Args:
        *args: Not used by this function
        **kwargs: Options looked up by the descriptors (endpoint, profile_name, ...)

    Returns:
        dict: The resolved configurations, stored under the keys
        "cloudpathlib", "pyogrio" and "rasterio".
    """
    import rasterio

    descriptors = in_house_s3_configs()

    cloudpathlib_config = args_dict(descriptors["cloudpathlib"], kwargs)
    pyogrio_config = args_dict(descriptors["pyogrio"], kwargs)
    rasterio_config = args_dict(descriptors["rasterio"], kwargs)

    # NOTE(review): the current rasterio environment is merged with the
    # resolved rasterio options here, but the merged dictionary is never
    # returned nor used afterwards - confirm whether it was meant to be
    # returned under "rasterio" or whether these lines can be dropped.
    if rasterio.env.hasenv():
        merged_rasterio_env = rasterio.env.getenv()
    else:
        merged_rasterio_env = {}
    merged_rasterio_env.update(rasterio_config)

    return {
        "cloudpathlib": cloudpathlib_config,
        "pyogrio": pyogrio_config,
        "rasterio": rasterio_config,
    }
def args_dict(args_l: list[dict], kwargs: dict) -> dict:
    """
    Convert a single in-house S3 configuration into a dictionary of ready to use key/value parameters.

    The input is a list of dict to process. Each dictionary can contain the following keys:

    - name: The name of the key in the output dict. If not given, the value of "kwargs" is used as the name.
    - kwargs: The name of the kwargs argument whose value is taken to set the value in the output dict.
    - default: A default value, used when neither kwargs nor the environment provide one.
    - env: The name of the environment variable whose value is taken to set the value in the output dict.
    - prefix: A prefix prepended to the value in the output dict.

    For example, if the environment variable "AWS_PROFILE" is unset, the following input:

        args_l = [{"name": "AWS_PROFILE", "kwargs": "profile_name", "env": "AWS_PROFILE"}]
        kwargs = {"profile_name": "unistra"}

    will output:

        {"AWS_PROFILE": "unistra"}

    Here is the order of precedence from least to greatest
    (the last listed source overrides all the others):

    1. Default value from key "default".
    2. Value from kwargs.
    3. Value from environment variable (ignored when unset or empty).

    A source whose value is None is skipped. If no value is found, the output dict will not contain the wanted parameter.

    Some output keys are post-processed:

    - "AWS_NO_SIGN_REQUEST" is converted to "YES" / "NO".
    - "AWS_REQUEST_PAYER" is converted to "requester" when truthy, None otherwise.
    - "requester_pays" is always removed; when truthy, {"RequestPayer": "requester"} is merged into "extra_args".

    Args:
        args_l: A list of parameters to extract. Each element will give a single key/value parameter.
        kwargs: Considered kwargs input to set parameters. It is never modified, nor are the objects it contains.

    Returns:
        dict: The resolved key/value parameters
    """
    ret = {}

    for arg in args_l:
        arg_name = arg["name"] if "name" in arg else arg["kwargs"]

        # First, set with default value
        default_value = arg.get("default")
        if default_value is not None:
            ret[arg_name] = default_value

        # Override with kwargs
        kwargs_key = arg.get("kwargs")
        if kwargs_key is not None:
            kwargs_value = kwargs.get(kwargs_key)
            if kwargs_value is not None:
                ret[arg_name] = kwargs_value

        # Override with environment variable (an empty variable counts as unset)
        env_key = arg.get("env")
        if env_key is not None:
            env_value = os.getenv(env_key)
            if env_value is not None and env_value != "":
                ret[arg_name] = env_value

        # The prefix is applied to whichever source won
        prefix = arg.get("prefix")
        if prefix is not None and ret.get(arg_name) is not None:
            ret[arg_name] = prefix + ret[arg_name]

    # Some exceptions
    if ret.get("AWS_NO_SIGN_REQUEST") is not None:
        ret["AWS_NO_SIGN_REQUEST"] = "YES" if ret["AWS_NO_SIGN_REQUEST"] else "NO"

    # NOTE(review): a falsy value is kept as an explicit None entry rather than
    # being dropped from the output - confirm every consumer handles None.
    if ret.get("AWS_REQUEST_PAYER") is not None:
        ret["AWS_REQUEST_PAYER"] = "requester" if ret["AWS_REQUEST_PAYER"] else None

    # "requester_pays" is not a parameter of its own: it is always removed and,
    # only when it is truthy, translated into the "RequestPayer" extra argument.
    # (Testing "is not None" here would enable requester-pays for False too.)
    if ret.pop("requester_pays", None):
        # Build a new dict so that the caller's own extra_args is left untouched
        ret["extra_args"] = {
            **(ret.get("extra_args") or {}),
            "RequestPayer": "requester",
        }

    return ret