Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .git-blame-ignore-revs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
235927ce73125df31ec3a0049b067afa1f0a135b
25eefd52d023f97870d8b4a27988f8fc91c3ed73
7d8c46cce63ff1b93836b72cdad74ec796b09ced
e917dfbe72cb0cccafb38fb0694f7fe9dff4c158

28 changes: 14 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
# inputdataTools
rimport used for publishing inputdata
relink.py used to remove files from $CESMDATAROOT and replace with links.
Tools used for publishing CESM input data.

Process to publish data:
The first step is to place your datafile(s) in /glade/campaign/cesm/cesmdata/inputdata following the inputdata naming convention.
When you have tested on derecho and are ready to share the new file(s) publically:
## Process to publish data
1. Place your datafile(s) in `/glade/campaign/cesm/cesmdata/inputdata/` (`$CESMDATAROOT`) following the input data naming conventions (see below).
2. When you have tested on derecho and are ready to share the new file(s) publicly, run the `rimport` script. This will ask you for a password and 2FA login before it can copy the files from this "input data" directory to the "publication" or "staging" directory. (This authentication should be possible for any member of the `cseg` group.)
3. Once that's done, `rimport` will replace the original with a link to the copy.
4. Sometime in the next 24 hours, your file should be uploaded to the GDEX server and available for download during CESM runs.

As user cesmdata run the rimport script. This requires a 2FA login, everyone in cseg should have access to the account, you will need to
contact cislhelp and request access if you are new to the group.
Notes:
- Use `rimport --check` if you'd like to see the current status of a file, including whether it's available for download.
- The `relink.py` script was previously used for step 3 above, but that functionality is now built into `rimport`. It's still there if you want to use it by itself.

As owner of the files in /glade/campaign/cesm/cesmdata/inputdata run script relink.py, this will remove the files from /glade/campaign/cesm/cesmdata/inputdata
and replace them with links to the published data location. /glade/campaign/collections/gdex/data/d651077/cesmdata/inputdata/

Filenames and metadata:
## Filenames and metadata:

There is a good description of metadata that should be included in inputdata files here: https://www.cesm.ucar.edu/models/cam/metadata

Filenames should be descriptive and should contain the date the file was created. Other information in the filename is also useful to keep as shown in the list below. Files published in inputdata should never be overwritten.

Replacement files should be different at least by creation date. Files that come from CESM simulations should normally follow the output naming conventions from https://www.cesm.ucar.edu/models/cesm2/naming-conventions#modelOutputFilenames

Files should be placed under the appropriate directory for the component it's used or applicable for (so under lnd/clm2 for data that applies to the CLM/CTSM land model). Subdirectories under those levels should be used to seperate data by general types as needed for that component.
Files should be placed under the appropriate directory for the component it's used or applicable for (so under `lnd/clm2/` for data that applies to the CLM/CTSM land model). Subdirectories under those levels should be used to separate data by general types as needed for that component.

Some suggestions on things to include in the filename:
- Spatial resolution of the gridded data
- Year (or years) for which the data was observed or applicable to
- Institution or project source of the data
- Creation date in the form of _cMMDDYY.nc
- Creation date in the form of `_cMMDDYY.nc`
- CESM casename that was used to create the data (also simulation date for it) (see output file naming conventions above)
- Things needed to distinquish it from other similar files in inputdata (i.e. things like number of vertical levels, land-mask, number of Plant Functional Types, etc.)
- Things needed to distinguish it from other similar input files (e.g., number of vertical levels, land mask, number of Plant Functional Types, etc.)
11 changes: 8 additions & 3 deletions relink.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ def replace_files_with_symlinks(
for file_path in find_owned_files_scandir(
item_to_process, user_uid, inputdata_root
):
logger.info("'%s':", file_path)
replace_one_file_with_symlink(
inputdata_root, target_dir, file_path, dry_run=dry_run
)
Expand All @@ -238,7 +239,6 @@ def replace_one_file_with_symlink(inputdata_root, target_dir, file_path, dry_run
file_path (str): The path of the file to be replaced.
dry_run (bool): If True, only show what would be done without making changes.
"""
logger.info("'%s':", file_path)

# Determine the relative path and the new link's destination
relative_path = os.path.relpath(file_path, inputdata_root)
Expand Down Expand Up @@ -270,7 +270,9 @@ def replace_one_file_with_symlink(inputdata_root, target_dir, file_path, dry_run
os.rename(link_name, link_name + ".tmp")
logger.info("%sDeleted original file: %s", INDENT, link_name)
except OSError as e:
logger.error("%sError deleting file %s: %s. Skipping.", INDENT, link_name, e)
logger.error(
"%sError deleting file %s: %s. Skipping relink.", INDENT, link_name, e
)
return

# Create the symbolic link, handling necessary parent directories
Expand All @@ -283,7 +285,10 @@ def replace_one_file_with_symlink(inputdata_root, target_dir, file_path, dry_run
except OSError as e:
os.rename(link_name + ".tmp", link_name)
logger.error(
"%sError creating symlink for %s: %s. Skipping.", INDENT, link_name, e
"%sError creating symlink for %s: %s. Skipping relink.",
INDENT,
link_name,
e,
)


Expand Down
53 changes: 36 additions & 17 deletions rimport
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
# TODO: Move all the Python into new file rimport.py for simpler testing. Keep rimport as a
# convenience wrapper.
"""
Copy files from CESM inputdata directory to a publishing directory.
Copy files from CESM inputdata directory to a publishing directory, then replace the original with a
symlink to the copy.

Do `rimport --help` for more information.
"""
from __future__ import annotations

import argparse
import logging
import os
import pwd
import shutil
Expand All @@ -19,6 +19,8 @@ from typing import Iterable, List
from urllib.request import Request, urlopen
from urllib.error import HTTPError

from relink import replace_one_file_with_symlink

import shared
INDENT = shared.INDENT

Expand All @@ -34,20 +36,13 @@ logger = shared.logger
def build_parser() -> argparse.ArgumentParser:
"""Build and configure the argument parser for rimport.

Creates an ArgumentParser with the following options:
- Mutually exclusive required group:
--file: Import a single file (relative to inputdata directory)
--list: Import multiple files from a list file
- Optional:
--inputdata: Override the default inputdata directory

Returns:
argparse.ArgumentParser: Configured parser ready to parse command-line arguments.
"""
parser = argparse.ArgumentParser(
description=(
f"Copy files from CESM inputdata directory ({DEFAULT_INPUTDATA_ROOT}) to a publishing"
" directory."
" directory, then replace the original with a symlink to the copy."
),
add_help=False, # Disable automatic help to add custom -help flag
)
Expand Down Expand Up @@ -144,10 +139,24 @@ def normalize_paths(root: Path, relnames: Iterable[str]) -> List[Path]:
return paths


def check_relink_worked(src: Path, dst: Path) -> None:
    """Verify that `src` is now a symlink whose final target is `dst`.

    Both paths are fully resolved before being compared, so a `dst` that is
    spelled through a symlinked directory (or otherwise not in canonical form)
    is still recognised as the target of `src`.

    Args:
        src (Path): Source file (should have been converted to symlink)
        dst (Path): Destination file (symlink target)

    Raises:
        RuntimeError: If src is not a symlink pointing to dst.
    """
    # src.resolve() follows every symlink in the chain and returns a canonical
    # absolute path; dst must be canonicalised the same way or a correct relink
    # could be misreported as a failure.
    if not (src.is_symlink() and src.resolve() == dst.resolve()):
        raise RuntimeError("Error relinking during rimport")


def stage_data(
src: Path, inputdata_root: Path, staging_root: Path, check: bool = False
) -> None:
"""Stage a file by mirroring its path under `staging_root`.
"""Stage a file by mirroring its path under `staging_root`, then replace with symlink to staged.

Destination path is computed by replacing the `inputdata_root` prefix of `src`
with `staging_root`, i.e.:
Expand All @@ -163,6 +172,7 @@ def stage_data(
RuntimeError: If `src` is a live symlink pointing outside staging, or if `src` is outside
the inputdata root, or if `src` is already under staging directory.
RuntimeError: If `src` is a broken symlink.
RuntimeError: If it failed to replace `src` with a symlink to the staged file.
FileNotFoundError: If `src` does not exist.

Guardrails:
Expand Down Expand Up @@ -200,20 +210,25 @@ def stage_data(
dst = staging_root / rel

if dst.exists():
logger.info("%sFile is already published but NOT linked; do", INDENT)
logger.info("%srelink.py %s", 2 * INDENT, rel)
logger.info("%sto resolve.", INDENT)
logger.info("File is already published but NOT linked; linking now.")
replace_one_file_with_symlink(inputdata_root, staging_root, str(src))
print_can_file_be_downloaded(can_file_be_downloaded(rel, staging_root))
check_relink_worked(src, dst)
return

if check:
logger.info("%sFile is not already published", INDENT)
return

# Copy file to destination
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src, dst)
logger.info("%s[rimport] staged %s -> %s", INDENT, src, dst)

# Replace original with symlink to destination
replace_one_file_with_symlink(inputdata_root, staging_root, str(src))
check_relink_worked(src, dst)


def ensure_running_as(target_user: str, argv: list[str]) -> None:
"""Ensure the script is running as the target user, re-executing via sudo if needed.
Expand Down Expand Up @@ -354,7 +369,7 @@ def get_files_to_process(file: str, filelist: str, items_to_process: list):
def main(argv: List[str] | None = None) -> int:
"""Main entry point for the rimport tool.

Copies files from the CESM inputdata directory to a staging/publishing directory,
Copies and relinks files from the CESM inputdata directory to a staging/publishing directory,
preserving the directory structure. Ensures the script runs as the correct user
(STAGE_OWNER) and handles both single files and file lists.

Expand All @@ -370,7 +385,7 @@ def main(argv: List[str] | None = None) -> int:

Exit Codes:
0: All files staged successfully.
1: One or more files failed to stage (errors printed to stderr).
1: One or more files failed to stage or relink (errors printed to stderr).
2: Fatal error (missing inputdata directory, missing file list, etc.).
"""
parser = build_parser()
Expand Down Expand Up @@ -410,7 +425,11 @@ def main(argv: List[str] | None = None) -> int:
errors += 1
logger.error("%srimport: error processing %s: %s", INDENT, p, e)

return 0 if errors == 0 else 1
if errors:
return 1
if not args.check:
logger.info("\nNo need to run relink.py")
return 0


if __name__ == "__main__":
Expand Down
40 changes: 36 additions & 4 deletions tests/relink/test_replace_files_with_symlinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from unittest.mock import patch, call
import pytest

import shared

# Add parent directory to path to import relink module
sys.path.insert(
0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
Expand All @@ -20,14 +22,25 @@
import relink # noqa: E402


@pytest.fixture(autouse=True)
def configure_logging_for_tests():
    """Configure logging for all tests in this module.

    Declared with ``autouse=True``, so it wraps each test without being
    requested explicitly. Before the test it calls
    ``shared.configure_logging`` at INFO level; after the test it clears the
    handlers on ``relink.logger``.
    """
    shared.configure_logging(logging.INFO)
    yield
    # Cleanup: drop the handlers on relink.logger once the test is done
    # (presumably so they do not accumulate across tests -- TODO confirm
    # against shared.configure_logging).
    relink.logger.handlers.clear()


@pytest.fixture(name="mock_replace_one")
def fixture_mock_replace_one():
    """Fixture that mocks relink.replace_one_file_with_symlink

    Exposed to tests under the name ``mock_replace_one``. Yields the mock
    object so tests can assert on how it was called.
    """
    # The patch stays active while the test body runs and is undone when the
    # ``with`` block exits after the yield.
    with patch("relink.replace_one_file_with_symlink") as mock:
        yield mock


def test_basic_file_replacement_given_dir(temp_dirs, current_user, mock_replace_one):
def test_basic_file_replacement_given_dir(
temp_dirs, current_user, mock_replace_one, caplog
):
"""Test basic functionality: given directory, replace owned file with symlink."""
inputdata_root, target_dir = temp_dirs
username = current_user
Expand Down Expand Up @@ -55,8 +68,13 @@ def test_basic_file_replacement_given_dir(temp_dirs, current_user, mock_replace_
dry_run=False,
)

# Verify message with filename was printed
assert f"'{source_file}':" in caplog.text

def test_basic_file_replacement_given_file(temp_dirs, current_user, mock_replace_one):

def test_basic_file_replacement_given_file(
temp_dirs, current_user, mock_replace_one, caplog
):
"""Test basic functionality: given owned file, replace with symlink."""
inputdata_root, target_dir = temp_dirs
username = current_user
Expand Down Expand Up @@ -84,8 +102,11 @@ def test_basic_file_replacement_given_file(temp_dirs, current_user, mock_replace
dry_run=False,
)

# Verify message with filename was printed
assert f"'{source_file}':" in caplog.text


def test_dry_run(temp_dirs, current_user, mock_replace_one):
def test_dry_run(temp_dirs, current_user, mock_replace_one, caplog):
"""Test that dry_run=True is passed correctly."""
inputdata_root, target_dir = temp_dirs
username = current_user
Expand Down Expand Up @@ -117,6 +138,9 @@ def test_dry_run(temp_dirs, current_user, mock_replace_one):
dry_run=True,
)

# Verify message with filename was printed
assert f"'{source_file}':" in caplog.text


def test_nested_directory_structure(temp_dirs, current_user, mock_replace_one):
"""Test with nested directory structures."""
Expand Down Expand Up @@ -175,6 +199,9 @@ def test_skip_existing_symlinks(temp_dirs, current_user, caplog, mock_replace_on
# Verify replace_one_file_with_symlink() wasn't called
mock_replace_one.assert_not_called()

# Verify message with filename was NOT printed
assert f"'{source_link}':" not in caplog.text


def test_missing_target_file(temp_dirs, current_user, caplog, mock_replace_one):
"""Test behavior when target file doesn't exist."""
Expand Down Expand Up @@ -224,7 +251,7 @@ def test_invalid_username(temp_dirs, caplog, mock_replace_one):
mock_replace_one.assert_not_called()


def test_multiple_files(temp_dirs, current_user, mock_replace_one):
def test_multiple_files(temp_dirs, current_user, mock_replace_one, caplog):
"""Test with multiple files in the directory."""
inputdata_root, target_dir = temp_dirs
username = current_user
Expand All @@ -251,6 +278,11 @@ def test_multiple_files(temp_dirs, current_user, mock_replace_one):
calls.append(call(inputdata_root, target_dir, source_file, dry_run=False))
mock_replace_one.assert_has_calls(calls, any_order=True)

# Verify message with filename was printed
for i in range(5):
source_file = os.path.join(inputdata_root, f"file_{i}.txt")
assert f"'{source_file}':" in caplog.text


def test_multiple_files_nested(temp_dirs, current_user, mock_replace_one):
"""Test with multiple files scattered throughout a nested directory tree."""
Expand Down
7 changes: 3 additions & 4 deletions tests/relink/test_replace_one_file_with_symlink.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def test_absolute_paths(temp_dirs):
os.chdir(cwd)


def test_print_found_owned_file(temp_dirs, caplog):
def test_no_print_found_owned_file(temp_dirs, caplog):
"""Test that message with filename is printed."""
source_dir, target_dir = temp_dirs

Expand All @@ -135,9 +135,8 @@ def test_print_found_owned_file(temp_dirs, caplog):
with caplog.at_level(logging.INFO):
relink.replace_one_file_with_symlink(source_dir, target_dir, source_file)

# Check that message was logged
assert f"'{source_file}':" in caplog.text
assert source_file in caplog.text
# Check that message was NOT logged (should happen in replace_files_with_symlinks instead)
assert f"'{source_file}':" not in caplog.text


def test_print_deleted_and_created_messages(temp_dirs, caplog):
Expand Down
Loading