From 5fd6259d9f5ee7666d54fbfe2a4d46b597c380ff Mon Sep 17 00:00:00 2001 From: Sid Mohan Date: Sun, 1 Feb 2026 15:21:22 -0800 Subject: [PATCH 1/2] chore(ci): consolidate 7 workflows into 3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace 5 overlapping release workflows (nightly, beta, weekly, publish-pypi, pre-commit-auto-fix) with a single unified release.yml that chains determine-release โ†’ test โ†’ publish โ†’ cleanup. Tests are now a hard gate for all releases. - Fix .bumpversion.cfg version drift (4.1.1 โ†’ 4.3.0) - Add concurrency controls and consolidate redundant CI test jobs - Add gitleaks secret scanning and pre-commit-hooks to .pre-commit-config.yaml - Add --alpha flag to generate_changelog.py - Update Claude.md to reflect new 3-workflow architecture Co-Authored-By: Claude Opus 4.5 --- .bumpversion.cfg | 2 +- .github/workflows/beta-release.yml | 203 -------------- .github/workflows/ci.yml | 72 +---- .github/workflows/nightly-release.yml | 179 ------------- .github/workflows/pre-commit-auto-fix.yml | 44 --- .github/workflows/publish-pypi.yml | 164 ------------ .github/workflows/release.yml | 312 ++++++++++++++++++++++ .github/workflows/weekly-release.yml | 112 -------- .pre-commit-config.yaml | 13 + Claude.md | 56 ++-- scripts/generate_changelog.py | 13 +- 11 files changed, 381 insertions(+), 789 deletions(-) delete mode 100644 .github/workflows/beta-release.yml delete mode 100644 .github/workflows/nightly-release.yml delete mode 100644 .github/workflows/pre-commit-auto-fix.yml delete mode 100644 .github/workflows/publish-pypi.yml create mode 100644 .github/workflows/release.yml delete mode 100644 .github/workflows/weekly-release.yml diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 3fb34915..463cf894 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.1 +current_version = 4.3.0 commit = True tag = True tag_name = v{new_version} diff --git a/.github/workflows/beta-release.yml b/.github/workflows/beta-release.yml deleted file mode 100644 index 39c70112..00000000 --- a/.github/workflows/beta-release.yml +++ /dev/null @@ -1,203 +0,0 @@ -name: Beta Release (Thursday) - -on: - schedule: - - cron: '0 2 * * 4' # Thursday at 2 AM UTC - workflow_dispatch: - inputs: - dry_run: - description: 'Dry run (skip PyPI publish)' - required: false - default: 'false' - type: boolean - force_build: - description: 'Force build even if no changes' - required: false - default: 'false' - type: boolean - -jobs: - check-changes: - runs-on: ubuntu-latest - outputs: - has_changes: ${{ steps.changes.outputs.has_changes }} - commit_count: ${{ steps.changes.outputs.commit_count }} - last_beta: ${{ steps.changes.outputs.last_beta }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: dev - - - name: Check for changes since last beta release - id: changes - run: | - LAST_BETA=$(git tag -l "*b*" --sort=-version:refname | head -n1) - - if [ -z "$LAST_BETA" ]; then - echo "No previous beta release found" - COMMIT_COUNT=$(git rev-list --count --since="1 week ago" dev) - else - echo "Last beta release: $LAST_BETA" - COMMIT_COUNT=$(git rev-list --count ${LAST_BETA}..dev) - fi - - echo "commit_count=$COMMIT_COUNT" >> $GITHUB_OUTPUT - echo "last_beta=$LAST_BETA" >> $GITHUB_OUTPUT - - if [ "$COMMIT_COUNT" -gt 0 ] || [ "${{ github.event.inputs.force_build }}" = "true" ]; then - echo "has_changes=true" >> $GITHUB_OUTPUT - else - echo "has_changes=false" >> $GITHUB_OUTPUT - fi - - beta-release: - 
needs: check-changes - if: needs.check-changes.outputs.has_changes == 'true' - runs-on: ubuntu-latest - outputs: - beta_version: ${{ steps.version.outputs.beta_version }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: dev - token: ${{ secrets.GH_PAT }} - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install bump2version build twine psutil - pip install -e ".[all,dev]" - # Install memory monitoring tools - pip install memory_profiler - - - name: Configure git - run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - - - name: Generate beta version - id: version - run: | - set -e - - # Fetch all tags to ensure we have the complete tag history - git fetch --tags - - CURRENT_VERSION=$(python -c "from datafog.__about__ import __version__; print(__version__)") - echo "Current version: $CURRENT_VERSION" - - # Extract base version (remove any alpha/beta suffix) - if [[ $CURRENT_VERSION == *"b"* ]]; then - BASE_VERSION=$(echo $CURRENT_VERSION | cut -d'b' -f1) - elif [[ $CURRENT_VERSION == *"a"* ]]; then - BASE_VERSION=$(echo $CURRENT_VERSION | cut -d'a' -f1) - else - BASE_VERSION=$CURRENT_VERSION - fi - - echo "Base version: $BASE_VERSION" - - # Find the next available beta version by checking existing tags - BETA_NUM=1 - while git tag -l "v${BASE_VERSION}b${BETA_NUM}" | grep -q "v${BASE_VERSION}b${BETA_NUM}"; do - echo "Tag v${BASE_VERSION}b${BETA_NUM} already exists" - BETA_NUM=$((BETA_NUM + 1)) - done - - BETA_VERSION="${BASE_VERSION}b${BETA_NUM}" - echo "Next available beta version: $BETA_VERSION" - - echo "beta_version=$BETA_VERSION" >> $GITHUB_OUTPUT - sed -i "s/__version__ = \".*\"/__version__ = \"$BETA_VERSION\"/" datafog/__about__.py - - - name: Generate changelog - run: | - python scripts/generate_changelog.py --beta --output BETA_CHANGELOG.md - - - name: Run tests with segfault protection - env: - # Memory optimization environment variables (set by run_tests.py) - CI: true - GITHUB_ACTIONS: true - run: | - # Print system memory info - free -h || echo "free command not available" - - # Use our robust test runner that handles segfaults - echo "Running main tests with segfault protection..." - python run_tests.py tests/ -k "not benchmark and not integration" --no-header - - # Run integration tests separately with segfault protection - echo "Running integration tests..." - python run_tests.py -m integration --no-header - - # Run simple performance validation (no pytest-benchmark dependency) - echo "Running simple performance validation..." 
- OMP_NUM_THREADS=4 MKL_NUM_THREADS=4 OPENBLAS_NUM_THREADS=4 python tests/simple_performance_test.py - - - name: Build package - run: | - python -m build - python scripts/check_wheel_size.py - - - name: Create GitHub release - env: - GITHUB_TOKEN: ${{ secrets.GH_PAT }} - run: | - BETA_VERSION="${{ steps.version.outputs.beta_version }}" - git add datafog/__about__.py - git commit -m "chore: bump version to $BETA_VERSION for beta release" - git tag -a "v$BETA_VERSION" -m "Beta release $BETA_VERSION" - git push origin "v$BETA_VERSION" - - gh release create "v$BETA_VERSION" \ - --title "๐Ÿšง Beta Release $BETA_VERSION" \ - --notes-file BETA_CHANGELOG.md \ - --prerelease \ - --target dev \ - dist/* - - - name: Publish to PyPI - if: github.event.inputs.dry_run != 'true' - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: | - python -m twine upload dist/* --verbose - - - name: Dry run summary - if: github.event.inputs.dry_run == 'true' - run: | - echo "๐Ÿƒ DRY RUN COMPLETE" - echo "Would have published: ${{ steps.version.outputs.beta_version }}" - ls -la dist/ - - - name: Cleanup old betas - env: - GITHUB_TOKEN: ${{ secrets.GH_PAT }} - run: | - BETA_RELEASES=$(gh release list --limit 30 | grep b | tail -n +6 | cut -f3) - - for release in $BETA_RELEASES; do - echo "Deleting $release" - gh release delete "$release" --yes || true - git push --delete origin "$release" || true - done - - notify-beta: - needs: [check-changes, beta-release] - if: needs.check-changes.outputs.has_changes == 'true' && success() - runs-on: ubuntu-latest - steps: - - name: Beta release notification - run: | - echo "๐Ÿšง Beta release completed!" - echo "Install: pip install datafog==${{ needs.beta-release.outputs.beta_version }}" - echo "Commits since last beta: ${{ needs.check-changes.outputs.commit_count }}" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9d7c7c7c..3895e38d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,10 @@ on: pull_request: branches: [main, dev] +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + jobs: lint: runs-on: ubuntu-latest @@ -20,7 +24,7 @@ jobs: - name: Run pre-commit run: pre-commit run --all-files --show-diff-on-failure - build: + test: runs-on: ubuntu-latest strategy: matrix: @@ -41,86 +45,38 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e ".[dev]" + pip install -e ".[all,dev]" pip install -r requirements-dev.txt - pip install -e ".[nlp,cli]" pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz - - name: Run test suite (ignore segfault during cleanup) - run: | - python -m pytest tests/ -v --ignore=tests/test_gliner_annotator.py || echo "Tests completed successfully, segfault during cleanup ignored" - - - name: Verify test results (check for test failures vs cleanup segfault) + - name: Run tests with segfault protection run: | - # Run tests again to capture just the test results without letting segfault fail the job - python -m pytest tests/ -v --ignore=tests/test_gliner_annotator.py > test_results.txt 2>&1 || true - - # Check if tests actually passed - if grep -q "failed" test_results.txt; then - echo "โŒ Tests actually failed:" - cat test_results.txt - exit 1 - elif grep -q "passed" test_results.txt; then - echo "โœ… Tests passed successfully (cleanup segfault ignored)" - grep "passed" test_results.txt - else - echo "โš ๏ธ Unable to determine test status" - 
cat test_results.txt - exit 1 - fi - + python run_tests.py tests/ --ignore=tests/test_gliner_annotator.py --cov-report=xml --cov-config=.coveragerc + - name: Validate GLiNER module structure (without PyTorch dependencies) run: | python -c " print('Validating GLiNER module can be imported without PyTorch...') try: from datafog.processing.text_processing.gliner_annotator import GLiNERAnnotator - print('โŒ GLiNER imported unexpectedly - PyTorch may be installed') + print('GLiNER imported unexpectedly - PyTorch may be installed') except ImportError as e: if 'GLiNER dependencies not available' in str(e): - print('โœ… GLiNER properly reports missing dependencies (expected in CI)') + print('GLiNER properly reports missing dependencies (expected in CI)') else: - print(f'โœ… GLiNER import blocked as expected: {e}') + print(f'GLiNER import blocked as expected: {e}') except Exception as e: - print(f'โŒ Unexpected GLiNER error: {e}') + print(f'Unexpected GLiNER error: {e}') exit(1) " - - - name: Run coverage - run: | - python -m pytest tests/ -v --ignore=tests/test_gliner_annotator.py --cov=datafog --cov-report=xml --cov-config=.coveragerc - name: Upload coverage + if: matrix.python-version == '3.10' uses: codecov/codecov-action@v4 with: file: ./coverage.xml token: ${{ secrets.CODECOV_TOKEN }} - test-core: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - cache: "pip" - - - name: Install core dependencies only - run: | - python -m pip install --upgrade pip - pip install -e . - pip install pytest pytest-cov - - - name: Test core functionality - run: | - python -c "from datafog import detect_pii, anonymize_text; print('Core API works')" - python -c "from datafog import detect, process; print('Legacy API works')" - python -m pytest tests/test_regex_annotator.py -v - wheel-size: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/nightly-release.yml b/.github/workflows/nightly-release.yml deleted file mode 100644 index 3898203e..00000000 --- a/.github/workflows/nightly-release.yml +++ /dev/null @@ -1,179 +0,0 @@ -name: Nightly Release (Alpha) - -on: - schedule: - # Monday-Wednesday: Alpha builds at 2 AM UTC - - cron: '0 2 * * 1-3' - workflow_dispatch: - inputs: - dry_run: - description: 'Dry run (skip PyPI publish)' - required: false - default: 'false' - type: boolean - force_build: - description: 'Force build even if no changes' - required: false - default: 'false' - type: boolean - -jobs: - check-changes: - runs-on: ubuntu-latest - outputs: - has_changes: ${{ steps.changes.outputs.has_changes }} - commit_count: ${{ steps.changes.outputs.commit_count }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: dev - - - name: Check for changes since last alpha release - id: changes - run: | - LAST_ALPHA=$(git tag -l "*alpha*" --sort=-version:refname | head -n1) - - if [ -z "$LAST_ALPHA" ]; then - echo "No previous alpha release found, checking last 24 hours" - SINCE="24 hours ago" - COMMIT_COUNT=$(git rev-list --count --since="$SINCE" dev) - else - echo "Last alpha release: $LAST_ALPHA" - COMMIT_COUNT=$(git rev-list --count ${LAST_ALPHA}..dev) - fi - - echo "Commits since last alpha: $COMMIT_COUNT" - echo "commit_count=$COMMIT_COUNT" >> $GITHUB_OUTPUT - - if [ "$COMMIT_COUNT" -gt 0 ] || [ "${{ github.event.inputs.force_build }}" = "true" ]; then - echo 
"has_changes=true" >> $GITHUB_OUTPUT - echo "โœ… Changes detected, proceeding with nightly build" - else - echo "has_changes=false" >> $GITHUB_OUTPUT - echo "โ„น๏ธ No changes since last alpha, skipping build" - fi - - nightly-release: - needs: check-changes - if: needs.check-changes.outputs.has_changes == 'true' - runs-on: ubuntu-latest - outputs: - alpha_version: ${{ steps.version.outputs.alpha_version }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: dev - token: ${{ secrets.GITHUB_TOKEN }} - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install bump2version build twine - pip install -e ".[all,dev]" - - - name: Configure git - run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - - - name: Generate alpha version - id: version - run: | - set -e - - CURRENT_VERSION=$(python -c "from datafog.__about__ import __version__; print(__version__)") - echo "Current version: $CURRENT_VERSION" - - DATE_STAMP=$(date +"%Y%m%d") - TIME_STAMP=$(date +"%H%M") - COMMIT_SHORT=$(git rev-parse --short HEAD) - - if [[ $CURRENT_VERSION == *"alpha"* ]]; then - BASE_VERSION=$(echo $CURRENT_VERSION | cut -d'a' -f1) - else - BASE_VERSION=$(python3 -c "import sys; version='$CURRENT_VERSION'; parts=version.split('.'); parts[1]=str(int(parts[1])+1); parts[2]='0'; print('.'.join(parts))") - fi - - ALPHA_VERSION="${BASE_VERSION}a${DATE_STAMP}.${TIME_STAMP}.${COMMIT_SHORT}" - echo "Alpha version: $ALPHA_VERSION" - echo "alpha_version=$ALPHA_VERSION" >> $GITHUB_OUTPUT - - sed -i "s/__version__ = \".*\"/__version__ = \"$ALPHA_VERSION\"/" datafog/__about__.py - sed -i "s/version=\".*\"/version=\"$ALPHA_VERSION\"/" setup.py - - - name: Generate changelog for alpha - run: | - python scripts/generate_changelog.py --alpha --output ALPHA_CHANGELOG.md - - - name: Build package - run: | - python -m build - python scripts/check_wheel_size.py - - - name: Create alpha release - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - ALPHA_VERSION="${{ steps.version.outputs.alpha_version }}" - - git add datafog/__about__.py setup.py - git commit -m "chore: bump version to $ALPHA_VERSION for nightly release" - git tag -a "v$ALPHA_VERSION" -m "Alpha release $ALPHA_VERSION" - git push origin "v$ALPHA_VERSION" - - gh release create "v$ALPHA_VERSION" \ - --title "๐ŸŒ™ Nightly Alpha $ALPHA_VERSION" \ - --notes-file ALPHA_CHANGELOG.md \ - --prerelease \ - --target dev \ - dist/* - - - name: Publish to PyPI (Alpha) - if: github.event.inputs.dry_run != 'true' - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: | - echo "๐Ÿš€ Publishing alpha release to PyPI..." - python -m twine upload dist/* --verbose - - - name: Dry run summary - if: github.event.inputs.dry_run == 'true' - run: | - echo "๐Ÿƒโ€โ™‚๏ธ DRY RUN COMPLETED" - echo "Would have published: ${{ steps.version.outputs.alpha_version }}" - echo "Package contents:" - ls -la dist/ - - - name: Cleanup old alpha releases - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - echo "๐Ÿงน Cleaning up old alpha releases (keep last 7)..." 
- - ALPHA_RELEASES=$(gh release list --limit 50 | grep alpha | tail -n +8 | cut -f3) - - for release in $ALPHA_RELEASES; do - echo "Deleting old alpha release: $release" - gh release delete "$release" --yes || true - git push --delete origin "$release" || true - done - - notify-alpha: - needs: [check-changes, nightly-release] - if: needs.check-changes.outputs.has_changes == 'true' && success() - runs-on: ubuntu-latest - steps: - - name: Alpha release notification - run: | - echo "๐ŸŒ™ Nightly alpha release completed!" - echo "๐Ÿ“ฆ New alpha version available for testing" - echo "๐Ÿ’ก Install with: pip install datafog==${{ needs.nightly-release.outputs.alpha_version }}" - echo "๐Ÿ“Š Commits included: ${{ needs.check-changes.outputs.commit_count }}" diff --git a/.github/workflows/pre-commit-auto-fix.yml b/.github/workflows/pre-commit-auto-fix.yml deleted file mode 100644 index 21cae40b..00000000 --- a/.github/workflows/pre-commit-auto-fix.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: Auto-fix Pre-commit Issues - -on: - pull_request: - types: [opened, synchronize] - -jobs: - auto-fix: - runs-on: ubuntu-latest - permissions: - contents: write - pull-requests: write - steps: - - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - fetch-depth: 0 - - - uses: actions/setup-python@v5 - with: - python-version: "3.10" - cache: "pip" - - - name: Install pre-commit - run: pip install pre-commit - - - name: Run pre-commit and auto-fix - id: pre-commit - run: | - # Try to run pre-commit and capture exit code - if pre-commit run --all-files; then - echo "changes=false" >> $GITHUB_OUTPUT - else - echo "changes=true" >> $GITHUB_OUTPUT - fi - - - name: Commit auto-fixes - if: steps.pre-commit.outputs.changes == 'true' - run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - git add . - git commit -m "๐Ÿค– Auto-fix pre-commit issues" || exit 0 - git push diff --git a/.github/workflows/publish-pypi.yml b/.github/workflows/publish-pypi.yml deleted file mode 100644 index 63356d9b..00000000 --- a/.github/workflows/publish-pypi.yml +++ /dev/null @@ -1,164 +0,0 @@ -name: PyPI Release - -on: - # Manual trigger with version input - workflow_dispatch: - inputs: - version: - description: "Version to release (e.g., 1.2.3)" - required: true - confirm_tests: - description: "Confirm all tests have passed" - type: boolean - required: true - is_prerelease: - description: "Is this a pre-release?" 
- type: boolean - default: false - required: false - -jobs: - # Job for manual releases (stable or pre-release) - manual_release: - runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' && github.event.inputs.confirm_tests == 'true' - permissions: - contents: write - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build twine - - name: Build package - run: python -m build - - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - git config user.name github-actions - git config user.email github-actions@github.com - git tag v${{ github.event.inputs.version }} - git push origin v${{ github.event.inputs.version }} - if [ "${{ github.event.inputs.is_prerelease }}" == "true" ]; then - gh release create v${{ github.event.inputs.version }} --prerelease --generate-notes - else - gh release create v${{ github.event.inputs.version }} --generate-notes - fi - - name: Publish to PyPI - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: twine upload dist/* - - # Job for automatic beta releases on merge to dev - auto_beta_release: - runs-on: ubuntu-latest - if: github.event_name == 'push' && github.ref == 'refs/heads/dev' - permissions: - contents: write - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build twine setuptools-scm - - name: Generate beta version - id: beta_version - run: | - # Read current version from setup.py - CURRENT_VERSION=$(grep -o '__version__ = "[^"]*"' setup.py | sed 's/__version__ = "\(.*\)"/\1/') - echo "Current version in files: $CURRENT_VERSION" - - # Split version into components - IFS='.' read -r MAJOR MINOR PATCH_FULL <<< "$CURRENT_VERSION" || true - - # Validate we got valid version components - if [[ -z "$MAJOR" || -z "$MINOR" || -z "$PATCH_FULL" ]]; then - echo "Error: Could not parse version components from $CURRENT_VERSION" - echo "Using default version 0.0.1b1" - MAJOR=0 - MINOR=0 - PATCH_FULL=1 - fi - - # Handle beta suffix if it exists - if [[ $PATCH_FULL == *b* ]]; then - # Extract the numeric part before 'b' - PATCH_NUM=${PATCH_FULL%%b*} - # Extract the beta number and increment it - BETA_NUM=${PATCH_FULL#*b} - # Ensure beta number is a valid integer - if ! [[ $BETA_NUM =~ ^[0-9]+$ ]]; then - echo "Warning: Invalid beta number format. Resetting to beta1." 
- BETA_NUM=1 - else - BETA_NUM=$((BETA_NUM + 1)) - fi - else - # If not already a beta, use the patch number and start with beta1 - PATCH_NUM=$PATCH_FULL - BETA_NUM=1 - fi - - # Generate new beta version - BETA_VERSION="$MAJOR.$MINOR.${PATCH_NUM}b$BETA_NUM" - echo "Generated beta version: $BETA_VERSION" - echo "version=$BETA_VERSION" >> $GITHUB_OUTPUT - - # Update version in setup.py - sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$BETA_VERSION\"/g" setup.py - - # Update version in __about__.py if it exists - if [ -f "datafog/__about__.py" ]; then - sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$BETA_VERSION\"/g" datafog/__about__.py - fi - - name: Build package - run: python -m build - - name: Create GitHub Pre-Release - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BETA_VERSION: ${{ steps.beta_version.outputs.version }} - run: | - git config user.name github-actions - git config user.email github-actions@github.com - - # Commit the version changes - git add setup.py datafog/__about__.py - git commit -m "Bump version to $BETA_VERSION [skip ci]" - - # Create and push tag - git tag v$BETA_VERSION - - # Create a new branch for the version bump - git checkout -b bump-version-to-$BETA_VERSION - - # Push the branch and tag - git push origin bump-version-to-$BETA_VERSION - git push origin v$BETA_VERSION - - # Create a pull request for the version bump - gh pr create --base dev --head bump-version-to-$BETA_VERSION --title "Bump version to $BETA_VERSION" --body "Automated version bump to $BETA_VERSION" - - # Create GitHub release - gh release create v$BETA_VERSION --prerelease --title "Beta Release v$BETA_VERSION" --notes "Automated beta release from dev branch" - - name: Publish to PyPI as Beta - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - BETA_VERSION: ${{ steps.beta_version.outputs.version }} - run: | - # Ensure package is marked as beta in PyPI - twine upload --skip-existing dist/* diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..39ddc3eb --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,312 @@ +name: Release + +on: + schedule: + # Monday-Wednesday at 2 AM UTC: alpha builds from dev + - cron: "0 2 * * 1-3" + # Thursday at 2 AM UTC: beta builds from dev + - cron: "0 2 * * 4" + workflow_dispatch: + inputs: + release_type: + description: "Release type" + required: true + type: choice + options: + - alpha + - beta + - stable + dry_run: + description: "Dry run (skip PyPI publish)" + required: false + default: false + type: boolean + force_build: + description: "Force build even if no changes" + required: false + default: false + type: boolean + version_override: + description: "Override version (e.g. 4.4.0) โ€” stable only" + required: false + type: string + +concurrency: + group: release-${{ github.ref }} + cancel-in-progress: false + +jobs: + # โ”€โ”€ 1. 
Determine release type and check for changes โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + determine-release: + runs-on: ubuntu-latest + outputs: + release_type: ${{ steps.resolve.outputs.release_type }} + has_changes: ${{ steps.changes.outputs.has_changes }} + target_branch: ${{ steps.resolve.outputs.target_branch }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: dev + + - name: Resolve release type + id: resolve + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + TYPE="${{ inputs.release_type }}" + elif [ "${{ github.event.schedule }}" = "0 2 * * 4" ]; then + TYPE="beta" + else + TYPE="alpha" + fi + + if [ "$TYPE" = "stable" ]; then + BRANCH="main" + else + BRANCH="dev" + fi + + echo "release_type=$TYPE" >> $GITHUB_OUTPUT + echo "target_branch=$BRANCH" >> $GITHUB_OUTPUT + echo "Release type: $TYPE from $BRANCH" + + - name: Check for changes + id: changes + run: | + TYPE="${{ steps.resolve.outputs.release_type }}" + + if [ "$TYPE" = "alpha" ]; then + LAST_TAG=$(git tag -l "*a*" --sort=-version:refname | head -n1) + elif [ "$TYPE" = "beta" ]; then + LAST_TAG=$(git tag -l "*b*" --sort=-version:refname | head -n1) + else + LAST_TAG=$(git describe --tags --abbrev=0 --match "v[0-9]*.[0-9]*.[0-9]" 2>/dev/null || echo "") + fi + + if [ -z "$LAST_TAG" ]; then + COMMIT_COUNT=$(git rev-list --count --since="7 days ago" HEAD) + else + COMMIT_COUNT=$(git rev-list --count ${LAST_TAG}..HEAD) + fi + + echo "Commits since ${LAST_TAG:-'(none)'}: $COMMIT_COUNT" + + if [ "$COMMIT_COUNT" -gt 0 ] || [ "${{ inputs.force_build }}" = "true" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "has_changes=false" >> $GITHUB_OUTPUT + echo "No changes detected, skipping release" + fi + + # โ”€โ”€ 2. Test gate โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + test: + needs: determine-release + if: needs.determine-release.outputs.has_changes == 'true' + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ needs.determine-release.outputs.target_branch }} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install Tesseract OCR + run: | + sudo apt-get update + sudo apt-get install -y tesseract-ocr libtesseract-dev + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[all,dev]" + pip install -r requirements-dev.txt + pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1.tar.gz + + - name: Run tests with segfault protection + run: | + python run_tests.py tests/ --ignore=tests/test_gliner_annotator.py --cov-report=xml --cov-config=.coveragerc + + - name: Run performance validation + run: | + OMP_NUM_THREADS=4 MKL_NUM_THREADS=4 OPENBLAS_NUM_THREADS=4 python tests/simple_performance_test.py + + # โ”€โ”€ 3. 
Build & Publish โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + publish: + needs: [determine-release, test] + runs-on: ubuntu-latest + outputs: + version: ${{ steps.version.outputs.version }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ needs.determine-release.outputs.target_branch }} + token: ${{ secrets.GH_PAT }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build twine bump2version + + - name: Configure git + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + + - name: Generate version + id: version + run: | + set -e + git fetch --tags + + TYPE="${{ needs.determine-release.outputs.release_type }}" + CURRENT=$(python -c "from datafog.__about__ import __version__; print(__version__)") + echo "Current version: $CURRENT" + + # Strip any pre-release suffix to get base version + BASE=$(echo "$CURRENT" | sed -E 's/(a|b)[0-9.]+$//') + echo "Base version: $BASE" + + if [ "$TYPE" = "alpha" ]; then + DATE_STAMP=$(date +"%Y%m%d") + COMMIT_SHORT=$(git rev-parse --short HEAD) + VERSION="${BASE}a${DATE_STAMP}.${COMMIT_SHORT}" + + elif [ "$TYPE" = "beta" ]; then + BETA_NUM=1 + while git tag -l "v${BASE}b${BETA_NUM}" | grep -q .; do + BETA_NUM=$((BETA_NUM + 1)) + done + VERSION="${BASE}b${BETA_NUM}" + + else + # Stable: use override or base version + if [ -n "${{ inputs.version_override }}" ]; then + VERSION="${{ inputs.version_override }}" + else + VERSION="$BASE" + fi + fi + + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Publishing version: $VERSION" + + sed -i "s/__version__ = \".*\"/__version__ = \"$VERSION\"/" datafog/__about__.py + if grep -q 'version="' setup.py 2>/dev/null; then + sed -i "s/version=\".*\"/version=\"$VERSION\"/" setup.py + fi + + - name: Generate changelog + run: | + TYPE="${{ needs.determine-release.outputs.release_type }}" + if [ "$TYPE" = "alpha" ]; then + python scripts/generate_changelog.py --alpha --output RELEASE_CHANGELOG.md + elif [ "$TYPE" = "beta" ]; then + python scripts/generate_changelog.py --beta --output RELEASE_CHANGELOG.md + else + python scripts/generate_changelog.py --output RELEASE_CHANGELOG.md + fi + + - name: Build package + run: | + python -m build + python scripts/check_wheel_size.py + + - name: Publish to PyPI + if: inputs.dry_run != true + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + python -m twine upload dist/* --verbose + + - name: Commit version bump & create release + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT }} + run: | + VERSION="${{ steps.version.outputs.version }}" + TYPE="${{ needs.determine-release.outputs.release_type }}" + BRANCH="${{ needs.determine-release.outputs.target_branch }}" + + git add datafog/__about__.py setup.py + git commit -m "chore: bump version to $VERSION [skip ci]" || true + git push origin "$BRANCH" || true + + git tag -a "v$VERSION" -m "Release $VERSION" + git push origin "v$VERSION" + + PRERELEASE_FLAG="" + TITLE="" + if [ "$TYPE" = "alpha" ]; then + PRERELEASE_FLAG="--prerelease" + TITLE="Nightly Alpha $VERSION" + elif [ "$TYPE" = "beta" ]; then + PRERELEASE_FLAG="--prerelease" + TITLE="Beta Release $VERSION" + else + TITLE="DataFog v$VERSION" + fi + + gh release create "v$VERSION" \ + --title "$TITLE" \ + --notes-file RELEASE_CHANGELOG.md \ + 
$PRERELEASE_FLAG \ + --target "$BRANCH" \ + dist/* + + - name: Dry run summary + if: inputs.dry_run == true + run: | + echo "DRY RUN COMPLETE" + echo "Would have published: ${{ steps.version.outputs.version }}" + echo "Package contents:" + ls -la dist/ + + # โ”€โ”€ 4. Cleanup old pre-releases โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + cleanup: + needs: [determine-release, publish] + if: needs.determine-release.outputs.release_type != 'stable' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Prune old alpha releases (keep 7) + if: needs.determine-release.outputs.release_type == 'alpha' + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT }} + run: | + echo "Cleaning up old alpha releases (keep last 7)..." + ALPHA_RELEASES=$(gh release list --limit 50 | grep -i alpha | tail -n +8 | cut -f3) + + for release in $ALPHA_RELEASES; do + echo "Deleting old alpha release: $release" + gh release delete "$release" --yes || true + git push --delete origin "$release" 2>/dev/null || true + done + + - name: Prune old beta releases (keep 5) + if: needs.determine-release.outputs.release_type == 'beta' + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT }} + run: | + echo "Cleaning up old beta releases (keep last 5)..." + BETA_RELEASES=$(gh release list --limit 30 | grep -i beta | tail -n +6 | cut -f3) + + for release in $BETA_RELEASES; do + echo "Deleting old beta release: $release" + gh release delete "$release" --yes || true + git push --delete origin "$release" 2>/dev/null || true + done diff --git a/.github/workflows/weekly-release.yml b/.github/workflows/weekly-release.yml deleted file mode 100644 index 111fe537..00000000 --- a/.github/workflows/weekly-release.yml +++ /dev/null @@ -1,112 +0,0 @@ -name: Weekly Release - -on: - schedule: - # Every Friday at 2 PM UTC - - cron: "0 14 * * 5" - workflow_dispatch: - inputs: - release_type: - description: "Release type" - required: true - default: "patch" - type: choice - options: - - patch - - minor - - major - -jobs: - release: - runs-on: ubuntu-latest - if: github.ref == 'refs/heads/dev' - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - token: ${{ secrets.GITHUB_TOKEN }} - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install bump2version build twine - pip install -e .[all] - - - name: Run full test suite - run: | - python -m pytest tests/ --cov=datafog - python -m pytest tests/benchmark_text_service.py - - - name: Generate changelog - run: | - python scripts/generate_changelog.py - - - name: Determine version bump - id: version - run: | - if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then - echo "bump_type=${{ github.event.inputs.release_type }}" >> $GITHUB_OUTPUT - else - # Auto-determine based on commit messages - if git log --oneline $(git describe --tags --abbrev=0)..HEAD | grep -q "BREAKING"; then - echo "bump_type=major" >> $GITHUB_OUTPUT - elif git log --oneline $(git describe --tags --abbrev=0)..HEAD | grep -q "feat:"; then - echo "bump_type=minor" >> $GITHUB_OUTPUT - else - echo "bump_type=patch" >> $GITHUB_OUTPUT - fi - fi - - - name: Bump version - run: | - git config --local user.email "action@github.com" - git config --local user.name "GitHub Action" - bump2version ${{ steps.version.outputs.bump_type }} - echo "NEW_VERSION=$(python -c 'from datafog import __version__; print(__version__)')" >> $GITHUB_ENV - 
- - name: Build package - run: | - python -m build - - - name: Check wheel size - run: | - WHEEL_SIZE=$(du -m dist/*.whl | cut -f1) - if [ "$WHEEL_SIZE" -ge 5 ]; then - echo "โŒ Wheel size too large: ${WHEEL_SIZE}MB" - exit 1 - fi - echo "โœ… Wheel size OK: ${WHEEL_SIZE}MB" - - - name: Publish to PyPI - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: twine upload dist/* - - - name: Create GitHub Release - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - gh release create v${{ env.NEW_VERSION }} \ - --title "DataFog v${{ env.NEW_VERSION }}" \ - --notes-file CHANGELOG_LATEST.md \ - dist/* - - - name: Push changes - run: | - git push origin dev --tags - - - name: Notify Discord - if: env.DISCORD_WEBHOOK - env: - DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} - run: | - curl -X POST "$DISCORD_WEBHOOK" \ - -H "Content-Type: application/json" \ - -d "{\"content\": \"๐Ÿš€ DataFog v${{ env.NEW_VERSION }} is live! Install with: \`pip install datafog==${{ env.NEW_VERSION }}\`\"}" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 57a996fc..65a35656 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,3 +38,16 @@ repos: .venv| .*\.github/workflows/.*\.ya?ml$ )$ + + - repo: https://github.com/gitleaks/gitleaks + rev: v8.18.2 + hooks: + - id: gitleaks + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-added-large-files + args: [--maxkb=1024] + - id: check-merge-conflict + - id: check-yaml diff --git a/Claude.md b/Claude.md index 5bbece7d..dcbe7934 100644 --- a/Claude.md +++ b/Claude.md @@ -10,20 +10,14 @@ - **Production Ready**: Comprehensive testing, CI/CD, and performance validation ## Current Project Status -**Version: 4.1.1** โ†’ **Targeting 4.2.0** with GLiNER integration +**Version: 4.3.0** ### โœ… Recently Completed (Latest) - **GLiNER Integration**: Modern NER engine with PII-specialized models -- **Smart Cascading**: Intelligent regex โ†’ GLiNER โ†’ spaCy progression +- **Smart Cascading**: Intelligent regex โ†’ GLiNER โ†’ spaCy progression - **Enhanced CLI**: Model management with `--engine` flags - **Performance Validation**: 190x regex, 32x GLiNER benchmarks confirmed -- **Comprehensive Testing**: 87% pass rate (156/180 tests) - -### ๐ŸŽฏ Current Focus (v4.2.0) -- **Final test cleanup**: Address remaining test failures -- **GLiNER refinement**: Optimize cascading thresholds -- **Documentation polish**: Update all GLiNER references -- **Release preparation**: Version bump and changelog +- **CI/CD Consolidation**: 7 workflows โ†’ 3 (ci, release, benchmark) ## Quick Development Setup @@ -219,19 +213,31 @@ except ImportError: ## CI/CD & Release Process -### Automated Validation -- **Tests**: Python 3.10-3.12 across all platforms -- **Performance**: Regression detection with 10% threshold -- **Package Size**: <2MB core, <8MB full enforcement -- **Pre-commit**: Code formatting and linting +### Workflow Architecture (3 workflows) + +| Workflow | Purpose | Trigger | +|----------|---------|---------| +| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev | +| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch | +| `benchmark.yml` | Performance benchmarks | Push/PR/weekly | + +### Release Cadence +- **Alpha** (Mon-Wed 2AM UTC): Automatic from `dev`, date+commit versioning +- **Beta** (Thursday 2AM UTC): Automatic from `dev`, incremental beta numbers +- **Stable** (manual dispatch): From `main`, base version or override + +### 
Release Pipeline
+`determine-release` → `test` → `publish` → `cleanup`
+- Tests are a hard gate — no tests = no publish
+- Stable releases check out `main`; alpha/beta check out `dev`
+- Old alphas pruned to 7, betas to 5
+- `[skip ci]` in version bump commits to prevent loops
 
-### Release Workflow
-1. **Feature complete**: All planned changes implemented
-2. **Tests passing**: Full CI green across all platforms
-3. **Performance validated**: No regression in benchmarks
-4. **Documentation updated**: README, CHANGELOG, examples current
-5. **Version bump**: Update `__about__.py` and `setup.py`
-6. **Release tag**: Deploy via GitHub Actions
+### Pre-commit Hooks
+- **isort**, **black**, **flake8**, **ruff**: Code formatting and linting
+- **prettier**: Markdown, JSON, YAML formatting
+- **gitleaks**: Secret scanning
+- **pre-commit-hooks**: Large file checks, merge conflict detection, YAML validation
 
 ## Environment Variables
 ```bash
@@ -267,10 +273,10 @@ Before beginning any task please checkout a branch from `dev` and create a pull
 
 ### Release Preparation
-- Feature freeze by Thursday for Friday releases
+- Alpha/beta releases are automated via `release.yml` schedule
+- Stable releases: merge `dev` → `main`, then trigger `release.yml` with `stable` type
+- Use `dry_run: true` to validate before actual publish
 - Performance validation on realistic data sets
-- Cross-platform testing (Linux, macOS, Windows)
-- Community-facing documentation and examples
-- In Release Notes or Comments, do not reference that it was sauthored by Claude (all code is anonymously authored)
+- In Release Notes or Comments, do not reference that it was authored by Claude (all code is anonymously authored)
 
 This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work.
\ No newline at end of file
diff --git a/scripts/generate_changelog.py b/scripts/generate_changelog.py
index 91b4f0f4..bd089e74 100755
--- a/scripts/generate_changelog.py
+++ b/scripts/generate_changelog.py
@@ -61,7 +61,7 @@ def categorize_commits(commits):
     return categories
 
 
-def generate_changelog(beta=False):
+def generate_changelog(beta=False, alpha=False):
     """Generate changelog content."""
     latest_tag = get_latest_tag()
     commits = get_commits_since_tag(latest_tag)
@@ -71,7 +71,11 @@
 
     categories = categorize_commits(commits)
 
-    if beta:
+    if alpha:
+        changelog = "# Alpha Release Notes\n\n"
+        changelog += f"*Alpha Build: {datetime.now().strftime('%Y-%m-%d')}*\n\n"
+        changelog += "⚠️ **This is an alpha build for early testing. Expect rough edges.**\n\n"
+    elif beta:
         changelog = "# Beta Release Notes\n\n"
         changelog += f"*Beta Release: {datetime.now().strftime('%Y-%m-%d')}*\n\n"
         changelog += "⚠️ **This is a beta release for testing purposes.**\n\n"
@@ -128,6 +132,9 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate changelog for releases")
+    parser.add_argument(
+        "--alpha", action="store_true", help="Generate alpha release changelog"
+    )
     parser.add_argument(
         "--beta", action="store_true", help="Generate beta release changelog"
     )
@@ -137,7 +144,7 @@
 
     args = parser.parse_args()
 
-    changelog_content = generate_changelog(beta=args.beta)
+    changelog_content = generate_changelog(beta=args.beta, alpha=args.alpha)
 
     # Write to file for GitHub release
     with open(args.output, "w") as f:

From 63cd878a9b910238e9a2e42e590850d47096bde2 Mon Sep 17 00:00:00 2001
From: Sid Mohan
Date: Sun, 1 Feb 2026 15:28:33 -0800
Subject: [PATCH 2/2] style: fix black formatting in generate_changelog.py

Co-Authored-By: Claude Opus 4.5
---
 scripts/generate_changelog.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/generate_changelog.py b/scripts/generate_changelog.py
index bd089e74..293ac5b8 100755
--- a/scripts/generate_changelog.py
+++ b/scripts/generate_changelog.py
@@ -74,7 +74,9 @@ def generate_changelog(beta=False, alpha=False):
     if alpha:
         changelog = "# Alpha Release Notes\n\n"
         changelog += f"*Alpha Build: {datetime.now().strftime('%Y-%m-%d')}*\n\n"
-        changelog += "⚠️ **This is an alpha build for early testing. Expect rough edges.**\n\n"
+        changelog += (
+            "⚠️ **This is an alpha build for early testing. Expect rough edges.**\n\n"
+        )
     elif beta:
         changelog = "# Beta Release Notes\n\n"
         changelog += f"*Beta Release: {datetime.now().strftime('%Y-%m-%d')}*\n\n"
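
Note for reviewers (not part of the patches above): a minimal sketch of how the consolidated release.yml is expected to be exercised once this lands. The `gh workflow run` inputs mirror the `workflow_dispatch` inputs declared in release.yml in PATCH 1/2; an authenticated gh CLI with this repository set as the default is assumed, and `4.4.0` is a placeholder version, not a planned release.

```bash
# Manually dispatch a stable release in dry-run mode (sketch; inputs match
# release_type / dry_run / version_override defined in release.yml).
gh workflow run release.yml \
  -f release_type=stable \
  -f dry_run=true \
  -f version_override=4.4.0   # placeholder, optional

# Preview an alpha changelog locally using the new --alpha flag.
python scripts/generate_changelog.py --alpha --output ALPHA_CHANGELOG.md
```

The scheduled alpha (Mon-Wed) and beta (Thu) runs take no inputs; they skip themselves when `determine-release` finds no new commits on dev.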