Hi @bharathjs,
If you want to run it faster, there is a better way: Databricks supports executing multiple ALTER COLUMN operations within a single ALTER TABLE statement.
ALTER TABLE your_table_name
ALTER COLUMN col1 COMMENT 'comment1',
ALTER COLUMN col2 COMMENT 'comment2',
ALTER COLUMN col3 COMMENT 'comment3';
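For contrast, the usual approach issues one ALTER TABLE statement per column, which means one metastore round trip per column - roughly this pattern (a sketch, assuming a columns_comments dict mapping column names to comments, like the one built below):
# Slow pattern: one statement (and one metastore commit) per column
for col, comment in columns_comments.items():
    spark.sql(f"ALTER TABLE your_table_name ALTER COLUMN `{col}` COMMENT '{comment}'")
With hundreds of columns, the batched form avoids almost all of that overhead.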
You can build the SQL programmatically and then run it. I prepared a test table with 200 columns, generated random comments, and added them all in one shot - the execution took just 0.18 seconds.
Here is the sample:
from pyspark.sql.types import StructType, StructField, IntegerType
# Define the number of columns
num_columns = 200
# Dynamically create a list of StructFields
fields = [StructField(f"col{i}", IntegerType(), True) for i in range(1, num_columns + 1)]
# Create the schema
schema = StructType(fields)
# Create an empty DataFrame with the schema
df = spark.createDataFrame([], schema)
# Define the database and table name
database_name = "default" # You can change this as needed
table_name = "test_comments"
# Create the test table
df.write.format("delta").mode("overwrite").saveAsTable(f"{database_name}.{table_name}")
print(f"Test table '{database_name}.{table_name}' with {num_columns} columns created successfully.")
import random
import string
# Function to generate a random comment
def generate_random_comment(length=30):
    return ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length))
# Generate comments for each column
columns_comments = {f"col{i}": generate_random_comment() for i in range(1, num_columns + 1)}
# Display a sample of the generated comments
for col in list(columns_comments.keys())[:5]:  # Show first 5 for brevity
    print(f"{col}: {columns_comments[col]}")
import time
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Function to escape single quotes in comments
def escape_single_quotes(comment):
    return comment.replace("'", "\\'")
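# Quick check of the escaping: "user's id" becomes user\'s id,
# which Spark SQL parses back to the original string
print(escape_single_quotes("user's id"))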
# Start the timer
start_time = time.time()
# Generate ALTER COLUMN statements for all columns
alter_statements = ",\n ".join([
    f"ALTER COLUMN `{col}` COMMENT '{escape_single_quotes(comment)}'"
    for col, comment in columns_comments.items()
])
# Construct the full ALTER TABLE SQL command
alter_table_sql = f"""
ALTER TABLE hive_metastore.{database_name}.{table_name}
{alter_statements};
"""
# Execute the ALTER TABLE statement
try:
    spark.sql(alter_table_sql)
    logger.info(f"Successfully updated comments for {len(columns_comments)} columns in '{database_name}.{table_name}'.")
except Exception as e:
    logger.error(f"Failed to update comments for table '{database_name}.{table_name}'. Error: {e}")
# End the timer
end_time = time.time()
batched_duration = end_time - start_time
print(f"Batched ALTER TABLE duration: {batched_duration:.2f} seconds.")