AbhaySingh
Databricks Employee
Databricks Employee

Have you read about the following approach before?

  Repository Structure Options

 

  1. Monorepo with Multiple Bundles

 

  repo-root/

  ├── .github/

    └── workflows/

        ├── bundle-ci.yml

        └── bundle-deploy.yml

  ├── bundles/

    ├── data-engineering/

      ├── databricks.yml

      ├── src/

      └── tests/

    ├── ml-workflows/

      ├── databricks.yml

      ├── src/

      └── tests/

    └── analytics/

        ├── databricks.yml

        ├── src/

        └── tests/

  └── scripts/

      └── detect-changed-bundles.sh

 

  2. Bundle-per-Directory with Shared Resources

 

  repo-root/

  ├── .github/workflows/

  ├── shared/

    ├── libraries/

    └── configurations/

  ├── bundle-1/

    └── databricks.yml

  ├── bundle-2/

    └── databricks.yml

  └── bundle-3/

      └── databricks.yml

 

  Dynamic Bundle Detection Strategy

 

  The key challenge you're facing is detecting which bundles changed in a PR. Here's a practical approach:

 

  GitHub Actions Workflow with Dynamic Detection

 

  name: Deploy Databricks Bundles

 

  on:

    pull_request:

      branches: [main, develop]

    push:

      branches: [main, develop]

 

  jobs:

    detect-changed-bundles:

      runs-on: ubuntu-latest

      outputs:

        bundles: ${{ steps.set-bundles.outputs.bundles }}

        has-changes: ${{ steps.set-bundles.outputs.has-changes }}

      steps:

        - uses: actions/checkout@v3

          with:

            fetch-depth: 0  # Important for git diff

 

        - name: Detect changed bundles

          id: set-bundles

          run: |

            # Get the base branch for comparison

            if [ "${{ github.event_name }}" == "pull_request" ]; then

              BASE_REF="${{ github.event.pull_request.base.sha }}"

            else

              BASE_REF="HEAD~1"

            fi

 

            # Find all bundle directories (containing databricks.yml)

            ALL_BUNDLES=$(find bundles -name "databricks.yml" -exec dirname {} \; | sort

  -u)

 

            # Detect which bundles have changes

            CHANGED_BUNDLES=""

            for bundle_dir in $ALL_BUNDLES; do

              if git diff --name-only $BASE_REF HEAD | grep -q "^${bundle_dir}/"; then

                bundle_name=$(basename $bundle_dir)

                CHANGED_BUNDLES="$CHANGED_BUNDLES\"$bundle_name\","

              fi

            done

 

            # Format as JSON array for matrix strategy

            if [ -n "$CHANGED_BUNDLES" ]; then

              CHANGED_BUNDLES="[${CHANGED_BUNDLES%,}]"

              echo "bundles=$CHANGED_BUNDLES" >> $GITHUB_OUTPUT

              echo "has-changes=true" >> $GITHUB_OUTPUT

            else

              echo "bundles=[]" >> $GITHUB_OUTPUT

              echo "has-changes=false" >> $GITHUB_OUTPUT

            fi

 

            echo "Changed bundles: $CHANGED_BUNDLES"

 

    validate-bundles:

      needs: detect-changed-bundles

      if: needs.detect-changed-bundles.outputs.has-changes == 'true'

      runs-on: ubuntu-latest

      strategy:

        matrix:

          bundle: ${{ fromJson(needs.detect-changed-bundles.outputs.bundles) }}

      steps:

        - uses: actions/checkout@v3

 

        - name: Setup Databricks CLI

          uses: databricks/setup-cli@main

 

        - name: Validate bundle

          run: |

            cd bundles/${{ matrix.bundle }}

            databricks bundle validate -t dev

          env:

            DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}

            DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}

 

    deploy-bundles:

      needs: [detect-changed-bundles, validate-bundles]

      if: github.event_name == 'push' && needs.detect-changed-bundles.outputs.has-changes

   == 'true'

      runs-on: ubuntu-latest

      strategy:

        matrix:

          bundle: ${{ fromJson(needs.detect-changed-bundles.outputs.bundles) }}

      steps:

        - uses: actions/checkout@v3

 

        - name: Setup Databricks CLI

          uses: databricks/setup-cli@main

 

        - name: Deploy bundle

          run: |

            cd bundles/${{ matrix.bundle }}

            databricks bundle deploy -t prod

          env:

            DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}

            DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}

 

  Alternative Approaches

 

  1. Path Filters Approach

 

  on:

    pull_request:

      paths:

        - 'bundles/data-engineering/**'

        - 'bundles/ml-workflows/**'

 

  jobs:

    deploy-data-engineering:

      if: contains(github.event.pull_request.changed_files, 'bundles/data-engineering/')

      # ... deployment steps

 

  2. Bundle Configuration File

 

  Create a bundles-config.json:

  {

    "bundles": [

      {

        "name": "data-engineering",

        "path": "bundles/data-engineering",

        "target": "prod"

      },

      {

        "name": "ml-workflows",

        "path": "bundles/ml-workflows",

        "target": "prod"

      }

    ]

  }

 

  3. Labels-Based Deployment (for PRs)

 

  jobs:

    deploy:

      if: contains(github.event.pull_request.labels.*.name, 'deploy:bundle-name')

      # ... deployment steps