Data Engineering

AWS NAT (Network Address Translation) Automated On-demand Destruct / Create

csmcpherson
New Contributor III

Hi folks, 

Our company typically uses Databricks during a 12-hour block; however, the AWS NAT gateway for the EC2 compute resources is up 24 hours a day, and I'd rather not pay for the idle hours.

I gather AWS Lambda and CloudWatch can be used to schedule / trigger NAT gateway destruction and creation.

1. Has anyone tried this with success, and can you provide guidance on best practice here?
2. Are there any important considerations to bear in mind (i.e., will removing the NAT gateway also destroy the attached route tables, security groups, or Elastic IP allocation)?

Thank you.

2 REPLIES

@Retired_mod 
Thanks for your reply. 


I created some Lambda functions to implement the NAT delete / create approach, factoring in route tables, Elastic IP details, and security groups, per the forum guide.

However, there is a problem: Databricks cannot connect to the EC2 resources. The clusters can initiate EC2 instance start-up, but cannot connect to the instances or even terminate them, and Databricks (DLT and compute) is constantly "waiting for resource", even though the instances are running in AWS.

Is there anything that I may have missed?

Lambda functions are below:
== delete lambda ==
(delete Lambda attached as a screenshot)

== create lambda ==

(create Lambda attached as screenshots)
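
One check that helps narrow this symptom down is whether the 0.0.0.0/0 route in the route table used by the Databricks subnets still points at a live NAT gateway after the recreate, since the clusters generally need outbound access to reach the Databricks control plane. A minimal sketch, with the route table ID as a placeholder:

import boto3

ec2 = boto3.client('ec2')

# Placeholder: the route table attached to the subnets the Databricks clusters launch into.
ROUTE_TABLE_ID = 'YOUR_ROUTE_TABLE_ID'

def check_default_route(route_table_id):
    """Print where the 0.0.0.0/0 route points and whether it is still usable."""
    route_table = ec2.describe_route_tables(RouteTableIds=[route_table_id])['RouteTables'][0]
    for route in route_table['Routes']:
        if route.get('DestinationCidrBlock') == '0.0.0.0/0':
            # A route state of 'blackhole' means its target (e.g. a deleted NAT gateway) no longer exists.
            print(f"0.0.0.0/0 -> {route.get('NatGatewayId')} (route state: {route.get('State')})")
            return
    print("No 0.0.0.0/0 route found in this route table.")

check_default_route(ROUTE_TABLE_ID)

If the route shows as 'blackhole' or still points at the old NAT gateway ID, that would match the "waiting for resource" behaviour described above.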

csmcpherson
New Contributor III

For anyone interested, this is how I ended up solving it, with pointers from AWS Support:
<< CREATE NAT >>

import boto3
import logging
from datetime import datetime

ec2 = boto3.client('ec2')
cloudwatch = boto3.client('logs')

def lambda_handler(event, context):
    allocation_id = 'YOUR_ELASTIC_IP_ALLOCATION_ID'  # Elastic IP allocation to attach to the NAT Gateway
    subnet_id = 'YOUR_SUBNET_ID'                     # public subnet the NAT Gateway is created in
    route_table_id = 'YOUR_ROUTE_TABLE_ID'           # route table whose 0.0.0.0/0 route should use the NAT Gateway

    response = ec2.create_nat_gateway(
        AllocationId=allocation_id,
        SubnetId=subnet_id
    )

    nat_gateway_id = response['NatGateway']['NatGatewayId']
    waiter = ec2.get_waiter('nat_gateway_available')
    
    try:
        print(f"Waiting for NAT Gateway {nat_gateway_id} to become available...")
        waiter.wait(NatGatewayIds=[nat_gateway_id],  WaiterConfig={
            'Delay': 15,
            'MaxAttempts': 40}
        )
        print(f"NAT Gateway {nat_gateway_id} is now available.")
        update_route_table(route_table_id, nat_gateway_id)
    except Exception as e:
        print(f"Error waiting for NAT Gateway to become available: {e}")

    try:
        nat_gateway_ip = response['NatGateway']['NatGatewayAddresses'][0]['PublicIp']
    except KeyError as e:
        print(f"KeyError: {e}. The NAT Gateway response does not contain the expected key.")
        nat_gateway_ip = None


    log_nat_gateway_details(nat_gateway_id, nat_gateway_ip, 'Created')

    return {
        'statusCode': 200,
        'body': f'NAT Gateway {nat_gateway_id} created with IP {nat_gateway_ip}'
    }

def update_route_table(route_table_id, nat_gateway_id):
    try:
        # Describe the existing routes in the route table
        route_table = ec2.describe_route_tables(RouteTableIds=[route_table_id])
        routes = route_table['RouteTables'][0]['Routes']
        
        # Check if a route for 0.0.0.0/0 exists and update it
        route_exists = False
        for route in routes:
            if route.get('DestinationCidrBlock') == '0.0.0.0/0':
                route_exists = True
                ec2.replace_route(
                    RouteTableId=route_table_id,
                    DestinationCidrBlock='0.0.0.0/0',
                    NatGatewayId=nat_gateway_id
                )
                print(f"Route updated in route table {route_table_id} destination 0.0.0.0/0 to point to NAT Gateway {nat_gateway_id}.")
                break

        # If no existing route for 0.0.0.0/0, create a new route
        if not route_exists:
            ec2.create_route(
                RouteTableId=route_table_id,
                DestinationCidrBlock='0.0.0.0/0',
                NatGatewayId=nat_gateway_id
            )
            print(f"New route created in route table {route_table_id} destination 0.0.0.0/ to point to NAT Gateway {nat_gateway_id}.")
    except Exception as e:
        logging.error(f"Error updating route table: {e}")

def log_nat_gateway_details(nat_gateway_id, nat_gateway_ip, action):
    log_group = 'NATGatewayLogs'
    log_stream = 'NATGatewayActions'

    try:
        cloudwatch.create_log_group(logGroupName=log_group)
    except cloudwatch.exceptions.ResourceAlreadyExistsException:
        pass

    try:
        cloudwatch.create_log_stream(logGroupName=log_group, logStreamName=log_stream)
    except cloudwatch.exceptions.ResourceAlreadyExistsException:
        pass

    timestamp = int(datetime.now().timestamp() * 1000)
    message = f'{action} NAT Gateway: {nat_gateway_id}, IP: {nat_gateway_ip} at {datetime.now()}'

    cloudwatch.put_log_events(
        logGroupName=log_group,
        logStreamName=log_stream,
        logEvents=[
            {
                'timestamp': timestamp,
                'message': message
            }
        ]
    )

<< DELETE NAT >>

import boto3
import logging
from datetime import datetime

ec2 = boto3.client('ec2')
cloudwatch = boto3.client('logs')

def lambda_handler(event, context):
    nat_gateway_id = get_available_nat_gateway_id()

    if not nat_gateway_id:
        print("No available NAT Gateway found.")
        return {
            'statusCode': 200,
            'body': 'No available NAT Gateway found'
        }

    ec2.delete_nat_gateway(NatGatewayId=nat_gateway_id)
    print(f"Deletion of NAT Gateway {nat_gateway_id} initiated.")

    log_nat_gateway_details(nat_gateway_id, 'Deleted')

    return {
        'statusCode': 200,
        'body': f'NAT Gateway {nat_gateway_id} deletion initiated'
    }

def get_available_nat_gateway_id():
    # Returns the first NAT Gateway in the 'available' state; assumes this
    # account/region only has the single NAT Gateway managed by these functions.
    response = ec2.describe_nat_gateways(
        Filters=[
            {
                'Name': 'state',
                'Values': ['available']
            }
        ]
    )
    for nat_gateway in response['NatGateways']:
        return nat_gateway['NatGatewayId']
    return None

def log_nat_gateway_details(nat_gateway_id, action):
    log_group = 'NATGatewayLogs'
    log_stream = 'NATGatewayActions'

    try:
        cloudwatch.create_log_group(logGroupName=log_group)
    except cloudwatch.exceptions.ResourceAlreadyExistsException:
        pass

    try:
        cloudwatch.create_log_stream(logGroupName=log_group, logStreamName=log_stream)
    except cloudwatch.exceptions.ResourceAlreadyExistsException:
        pass

    timestamp = int(datetime.now().timestamp() * 1000)
    message = f'{action} NAT Gateway: {nat_gateway_id} at {datetime.now()}'

    cloudwatch.put_log_events(
        logGroupName=log_group,
        logStreamName=log_stream,
        logEvents=[
            {
                'timestamp': timestamp,
                'message': message
            }
        ]
    )
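
The scheduling side mentioned at the top of the thread (triggering these Lambdas on a 12-hour window) isn't shown above; one way to wire it up is a pair of EventBridge / CloudWatch Events cron rules targeting the two functions. This is a minimal sketch only; the rule names, cron times (UTC), and Lambda ARNs are placeholders to replace with your own.

import boto3

events = boto3.client('events')
lambda_client = boto3.client('lambda')

# Placeholders: the ARNs of the create/delete Lambdas above and the desired window (UTC).
CREATE_LAMBDA_ARN = 'arn:aws:lambda:REGION:ACCOUNT_ID:function:create-nat-gateway'
DELETE_LAMBDA_ARN = 'arn:aws:lambda:REGION:ACCOUNT_ID:function:delete-nat-gateway'

SCHEDULES = [
    # (rule name, cron expression, target Lambda ARN)
    ('create-nat-start-of-day', 'cron(0 7 * * ? *)', CREATE_LAMBDA_ARN),
    ('delete-nat-end-of-day', 'cron(0 19 * * ? *)', DELETE_LAMBDA_ARN),
]

for rule_name, cron, lambda_arn in SCHEDULES:
    rule_arn = events.put_rule(Name=rule_name, ScheduleExpression=cron, State='ENABLED')['RuleArn']
    events.put_targets(Rule=rule_name, Targets=[{'Id': 'nat-lambda-target', 'Arn': lambda_arn}])
    # Grant EventBridge permission to invoke the Lambda (raises ResourceConflictException
    # if a statement with this ID already exists).
    lambda_client.add_permission(
        FunctionName=lambda_arn,
        StatementId=f'{rule_name}-invoke',
        Action='lambda:InvokeFunction',
        Principal='events.amazonaws.com',
        SourceArn=rule_arn,
    )

The Lambda execution roles also need the matching EC2 permissions (e.g. ec2:CreateNatGateway, ec2:DeleteNatGateway, ec2:DescribeNatGateways, ec2:DescribeRouteTables, ec2:ReplaceRoute, ec2:CreateRoute) plus logs:CreateLogGroup, logs:CreateLogStream, and logs:PutLogEvents for the CloudWatch logging used above.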

 
