Source code for nowcast.workers.make_averaged_dataset

#  Copyright 2013 – present by the SalishSeaCast Project contributors
#  and The University of British Columbia
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

# SPDX-License-Identifier: Apache-2.0


"""SalishSeaCast worker that creates a down-sampled time-series dataset netCDF4 file from
another model product file using the Reshapr API.
"""
# Intended uses:
#
# * create day-averaged datasets from hour-averaged NEMO output files as a model run
#   post-processing step
# * create month-averaged datasets from day-averaged datasets at the end of each month of running
import logging
import os
from pathlib import Path

import arrow
import structlog
from nemo_nowcast import NowcastWorker, WorkerError
import reshapr.api.v1.extract
import xarray
from tenacity import retry, stop_after_attempt, wait_random, retry_if_exception_type

NAME = "make_averaged_dataset"
logger = logging.getLogger(NAME)


def main():
    """Set up and run the worker.

    For command-line usage see:

    :command:`python -m nowcast.workers.make_averaged_dataset --help`
    """
    # structlog is configured before the worker is created so that log messages
    # coming from Reshapr are rendered through the standard logging module
    _configure_structlog()
    nowcast_worker = NowcastWorker(NAME, description=__doc__)
    nowcast_worker.init_cli()
    cli = nowcast_worker.cli
    # Positional arguments: averaging interval, then variable group
    cli.add_argument(
        "avg_time_interval",
        choices={"day", "month"},
        help="Time interval over which to average the dataset",
    )
    cli.add_argument(
        "reshapr_var_group",
        choices={"biology", "chemistry", "physics"},
        help="Dataset variable group to run extraction for",
    )
    # The default run date is the start of the current day
    today = arrow.now().floor("day")
    run_date_help = (
        " Date of the run to calculate the day-averaged dataset for, "
        "or start date of the collection of daily runs to calculate "
        "the month-averaged dataset for. "
    )
    cli.add_date_option("--run-date", default=today, help=run_date_help)
    nowcast_worker.run(make_averaged_dataset, success, failure)
    return nowcast_worker
def _configure_structlog(): """Configure structlog (used by Reshapr) to pass a formatted log message string to the :py:mod:`logging`. ref: https://www.structlog.org/en/latest/standard-library.html#rendering-within-structlog """ structlog.configure( processors=[ # If log level is too low, abort pipeline and throw away log entry. structlog.stdlib.filter_by_level, # If the "stack_info" key in the event dict is true, remove it and # render the current stack trace in the "stack" key. structlog.processors.StackInfoRenderer(), # If the "exc_info" key in the event dict is either true or a # sys.exc_info() tuple, remove "exc_info" and render the exception # with traceback into the "exception" key. structlog.processors.format_exc_info, # If some value is in bytes, decode it to a unicode str. structlog.processors.UnicodeDecoder(), # Render the final event dict nicely aligned and ordered, but without colours # because we're generally logging to files. structlog.dev.ConsoleRenderer(colors=False), ], # ``wrapper_class`` is the bound logger that you get back from # get_logger(). This one imitates the API of ``logging.Logger``. wrapper_class=structlog.stdlib.BoundLogger, # ``logger_factory`` is used to create wrapped loggers that are used for # OUTPUT. This one returns a ``logging.Logger``. The final value (a string) # from the final processor (``ConsoleRenderer``) will be passed to # the method of the same name as that called on the bound logger. logger_factory=structlog.stdlib.LoggerFactory(), # Effectively freeze configuration after creating the first bound logger. 
cache_logger_on_first_use=True, ) def success(parsed_args): avg_time_interval = parsed_args.avg_time_interval run_date = parsed_args.run_date reshapr_var_group = parsed_args.reshapr_var_group match avg_time_interval: case "day": logger.info( f"{avg_time_interval}-averaged dataset for {run_date.format('DD-MMM-YYYY')} " f"{reshapr_var_group} created" ) case "month": logger.info( f"{avg_time_interval}-averaged dataset for {run_date.format('MMM-YYYY')} " f"{reshapr_var_group} created" ) msg_type = f"success {avg_time_interval} {reshapr_var_group}" return msg_type def failure(parsed_args): avg_time_interval = parsed_args.avg_time_interval run_date = parsed_args.run_date reshapr_var_group = parsed_args.reshapr_var_group match avg_time_interval: case "day": logger.critical( f"{avg_time_interval}-averaged dataset for {run_date.format('DD-MMM-YYYY')} " f"{reshapr_var_group} creation failed" ) case "month": logger.critical( f"{avg_time_interval}-averaged dataset for {run_date.format('MMM-YYYY')} " f"{reshapr_var_group} creation failed" ) msg_type = f"failure {avg_time_interval} {reshapr_var_group}" return msg_type def make_averaged_dataset(parsed_args, config, *args): avg_time_interval = parsed_args.avg_time_interval reshapr_var_group = parsed_args.reshapr_var_group run_date = parsed_args.run_date if avg_time_interval == "month" and run_date.day != 1: logger.error( f"Month-averaging must start on the first day of a month but " f"run_date = {run_date.format('YYYY-MM-DD')}" ) raise WorkerError reshapr_config_dir = Path(config["averaged datasets"]["reshapr config dir"]) reshapr_config_yaml = config["averaged datasets"][avg_time_interval][ reshapr_var_group ]["reshapr config"] match avg_time_interval: case "day": logger.info( f"creating {avg_time_interval}-averaged dataset for " f"{run_date.format('DD-MMM-YYYY')} {reshapr_var_group}" ) reshapr_config = reshapr.api.v1.extract.load_extraction_config( reshapr_config_dir / reshapr_config_yaml, run_date, run_date ) dest_dir = 
Path(reshapr_config["extracted dataset"]["dest dir"]) ddmmmyy = run_date.format("DDMMMYY").lower() reshapr_config["extracted dataset"]["dest dir"] = dest_dir / ddmmmyy case "month": logger.info( f"creating {avg_time_interval}-averaged dataset for " f"{run_date.format('MMM-YYYY')} {reshapr_var_group}" ) start_date = run_date end_date = run_date.shift(months=+1, days=-1) reshapr_config = reshapr.api.v1.extract.load_extraction_config( reshapr_config_dir / reshapr_config_yaml, start_date, end_date ) nc_path = _extract_netcdf(reshapr_config, reshapr_config_yaml) nc_path.chmod(0o664) if avg_time_interval == "day": file_pattern = config["averaged datasets"][avg_time_interval][ reshapr_var_group ]["file pattern"] dest_nc_filename = file_pattern.format(yyyymmdd=run_date.format("YYYYMMDD")) nc_path = nc_path.rename(nc_path.with_name(dest_nc_filename)) return { f"{avg_time_interval} {reshapr_var_group}": { "run date": run_date.format("YYYY-MM-DD"), "file path": os.fspath(nc_path), } } @retry( retry=retry_if_exception_type((WorkerError, OSError, RuntimeError)), reraise=True, stop=stop_after_attempt(5), wait=wait_random(min=5, max=10), ) def _extract_netcdf(reshapr_config, reshapr_config_yaml): nc_path = reshapr.api.v1.extract.extract_netcdf(reshapr_config, reshapr_config_yaml) if not nc_path.exists(): logger.error(f"reshapr extraction failed for {os.fspath(nc_path)}") raise WorkerError() nc_path_size = nc_path.stat().st_size if nc_path_size < 1_000_000: logger.error( f"reshapr extraction failed: {os.fspath(nc_path)} is too small: {nc_path_size}" ) raise WorkerError() try: xarray.open_dataset(nc_path) except OSError as exc: logger.error(exc) raise WorkerError return nc_path if __name__ == "__main__": main() # pragma: no cover