<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Generating a PostgreSQL Table Schema for ETL in Databricks in Community Articles</title>
    <link>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128874#M561</link>
    <description>&lt;P&gt;In a data migration project, I needed to generate the schema of a PostgreSQL table to use in my ETL process. I’d like to share the code snippet in case someone else needs it one day:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import SparkSession
import json
import os
from typing import Dict, Any

# Mapeamento PostgreSQL → Spark SQL Types
POSTGRES_TYPE_MAPPING = {
    'int': 'IntegerType',
    'integer': 'IntegerType',
    'bigint': 'IntegerType',
    'smallint': 'IntegerType',
    'tinyint': 'IntegerType',
    'varchar': 'StringType',
    'character varying': 'StringType',
    'char': 'StringType',
    'text': 'StringType',
    'boolean': 'BooleanType',
    'double': 'DoubleType',
    'float': 'FloatType',
    'real': 'FloatType',
    'numeric': 'DecimalType',
    'decimal': 'DecimalType',
    'date': 'DateType',
    'timestamp': 'TimestampType',
    'timestamp without time zone': 'TimestampType',
    'bytea': 'BinaryType'
}

def adjust_dtype(dtype_str: str) -&amp;gt; str:
    """
    Maps PostgreSQL data types to Spark SQL Types.
    Falls back to StringType for unknown types.
    """
    base_type = dtype_str.lower().strip()
    mapped = POSTGRES_TYPE_MAPPING.get(base_type)
    if not mapped:
        print(f"[WARN] Unmapped type: {base_type}. Defaulting to StringType.")
        return "StringType"
    return mapped

def print_formatted_postgres_schema(schema_dict: Dict[str, Any], table_name: str):
    """
    Prints schema in a formatted dictionary style for readability.
    """
    print(f"{table_name}_schema = {{")
    for field, props in schema_dict.items():
        print(f"    '{field}': {{")
        print(f"        'size': {props['size']},")
        print(f"        'dtype': '{props['dtype']}',")
        print(f"        'unique': {props['unique']},")
        print(f"        'nullable': {props['nullable']}")
        print(f"    }},")
    print("}")

def generate_postgres_schema_dict(
    jdbc_url: str,
    schema: str,
    table: str,
    secret_scope: str,
    export_path: str = None
) -&amp;gt; Dict[str, Any]:
    """
    Generates schema dictionary from PostgreSQL metadata.
    Optionally exports schema as JSON file to DBFS or local path.
    """

    user = dbutils.secrets.get(scope=secret_scope, key="postgres_user")
    password = dbutils.secrets.get(scope=secret_scope, key="postgres_password")

    query = f"""
    SELECT 
        column_name,
        is_nullable = 'YES' AS nullable,
        EXISTS (
            SELECT 1
            FROM information_schema.table_constraints tc
            JOIN information_schema.constraint_column_usage ccu
              ON tc.constraint_name = ccu.constraint_name
            WHERE tc.constraint_type = 'UNIQUE'
              AND tc.table_schema = cols.table_schema
              AND tc.table_name = cols.table_name
              AND ccu.column_name = cols.column_name
        ) AS is_unique,
        data_type,
        character_maximum_length,
        numeric_precision
    FROM information_schema.columns cols
    WHERE table_schema = '{schema}' AND table_name = '{table}'
    """

    # Lê metadados da tabela com JDBC usando apenas .option()
    df_metadata = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("query", query)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .load()
    )

    schema_dict = {}
    for row in df_metadata.toLocalIterator():  # mais seguro que collect()
        column = row["column_name"]
        data_type = row["data_type"]
        size = row["character_maximum_length"] or row["numeric_precision"] or None

        schema_dict[column] = {
            "size": size,
            "dtype": adjust_dtype(data_type),
            "unique": row["is_unique"],
            "nullable": row["nullable"]
        }
    
    print_formatted_postgres_schema(schema_dict, table)

    # Optional JSON export
    #if export_path:
    #    with open(export_path, "w") as f:
    #        json.dump(schema_dict, f, indent=4)
    #    print(f"[INFO] Schema exported to {export_path}")

    return schema_dict&lt;/LI-CODE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="WiliamRosa_0-1755612791624.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/19193iE1E00777383968E7/image-size/medium?v=v2&amp;amp;px=400" role="button" title="WiliamRosa_0-1755612791624.png" alt="WiliamRosa_0-1755612791624.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
    <pubDate>Tue, 19 Aug 2025 14:13:41 GMT</pubDate>
    <dc:creator>WiliamRosa</dc:creator>
    <dc:date>2025-08-19T14:13:41Z</dc:date>
    <item>
      <title>Generating a PostgreSQL Table Schema for ETL in Databricks</title>
      <link>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128874#M561</link>
      <description>&lt;P&gt;In a data migration project, I needed to generate the schema of a PostgreSQL table to use in my ETL process. I’d like to share the code snippet in case someone else needs it one day:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import SparkSession
import json
import os
from typing import Dict, Any

# Mapeamento PostgreSQL → Spark SQL Types
POSTGRES_TYPE_MAPPING = {
    'int': 'IntegerType',
    'integer': 'IntegerType',
    'bigint': 'IntegerType',
    'smallint': 'IntegerType',
    'tinyint': 'IntegerType',
    'varchar': 'StringType',
    'character varying': 'StringType',
    'char': 'StringType',
    'text': 'StringType',
    'boolean': 'BooleanType',
    'double': 'DoubleType',
    'float': 'FloatType',
    'real': 'FloatType',
    'numeric': 'DecimalType',
    'decimal': 'DecimalType',
    'date': 'DateType',
    'timestamp': 'TimestampType',
    'timestamp without time zone': 'TimestampType',
    'bytea': 'BinaryType'
}

def adjust_dtype(dtype_str: str) -&amp;gt; str:
    """
    Maps PostgreSQL data types to Spark SQL Types.
    Falls back to StringType for unknown types.
    """
    base_type = dtype_str.lower().strip()
    mapped = POSTGRES_TYPE_MAPPING.get(base_type)
    if not mapped:
        print(f"[WARN] Unmapped type: {base_type}. Defaulting to StringType.")
        return "StringType"
    return mapped

def print_formatted_postgres_schema(schema_dict: Dict[str, Any], table_name: str):
    """
    Prints schema in a formatted dictionary style for readability.
    """
    print(f"{table_name}_schema = {{")
    for field, props in schema_dict.items():
        print(f"    '{field}': {{")
        print(f"        'size': {props['size']},")
        print(f"        'dtype': '{props['dtype']}',")
        print(f"        'unique': {props['unique']},")
        print(f"        'nullable': {props['nullable']}")
        print(f"    }},")
    print("}")

def generate_postgres_schema_dict(
    jdbc_url: str,
    schema: str,
    table: str,
    secret_scope: str,
    export_path: str = None
) -&amp;gt; Dict[str, Any]:
    """
    Generates schema dictionary from PostgreSQL metadata.
    Optionally exports schema as JSON file to DBFS or local path.
    """

    user = dbutils.secrets.get(scope=secret_scope, key="postgres_user")
    password = dbutils.secrets.get(scope=secret_scope, key="postgres_password")

    query = f"""
    SELECT 
        column_name,
        is_nullable = 'YES' AS nullable,
        EXISTS (
            SELECT 1
            FROM information_schema.table_constraints tc
            JOIN information_schema.constraint_column_usage ccu
              ON tc.constraint_name = ccu.constraint_name
            WHERE tc.constraint_type = 'UNIQUE'
              AND tc.table_schema = cols.table_schema
              AND tc.table_name = cols.table_name
              AND ccu.column_name = cols.column_name
        ) AS is_unique,
        data_type,
        character_maximum_length,
        numeric_precision
    FROM information_schema.columns cols
    WHERE table_schema = '{schema}' AND table_name = '{table}'
    """

    # Lê metadados da tabela com JDBC usando apenas .option()
    df_metadata = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("query", query)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .load()
    )

    schema_dict = {}
    for row in df_metadata.toLocalIterator():  # mais seguro que collect()
        column = row["column_name"]
        data_type = row["data_type"]
        size = row["character_maximum_length"] or row["numeric_precision"] or None

        schema_dict[column] = {
            "size": size,
            "dtype": adjust_dtype(data_type),
            "unique": row["is_unique"],
            "nullable": row["nullable"]
        }
    
    print_formatted_postgres_schema(schema_dict, table)

    # Optional JSON export
    #if export_path:
    #    with open(export_path, "w") as f:
    #        json.dump(schema_dict, f, indent=4)
    #    print(f"[INFO] Schema exported to {export_path}")

    return schema_dict&lt;/LI-CODE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="WiliamRosa_0-1755612791624.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/19193iE1E00777383968E7/image-size/medium?v=v2&amp;amp;px=400" role="button" title="WiliamRosa_0-1755612791624.png" alt="WiliamRosa_0-1755612791624.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 19 Aug 2025 14:13:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128874#M561</guid>
      <dc:creator>WiliamRosa</dc:creator>
      <dc:date>2025-08-19T14:13:41Z</dc:date>
    </item>
    <item>
      <title>Re: Generating a PostgreSQL Table Schema for ETL in Databricks</title>
      <link>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128878#M562</link>
      <description>&lt;P&gt;Suggestions are welcome. I hope this was helpful.&lt;/P&gt;</description>
      <pubDate>Tue, 19 Aug 2025 14:21:30 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128878#M562</guid>
      <dc:creator>WiliamRosa</dc:creator>
      <dc:date>2025-08-19T14:21:30Z</dc:date>
    </item>
    <item>
      <title>Re: Generating a PostgreSQL Table Schema for ETL in Databricks</title>
      <link>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128886#M563</link>
      <description>&lt;P&gt;Thanks for sharing. One tip for you, next time if you have something you'd like to share with community we have dedicated place for that: Knowledge Sharing Hub.&lt;/P&gt;&lt;P&gt;Another tip: marking a post as the best answer should only apply to threads where someone defines a problem or asks a question, and your reply provides a solution. If you want to share knowledge or an interesting case from a project - use the knowledge sharing hub.&lt;/P&gt;&lt;P&gt;Otherwise, it will simply be considered as overuse of the system. And why? To give you example, a user could post 30 different tips in one day and mark them all as the best answer, unfairly moving up the leaderboard.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 19 Aug 2025 15:00:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128886#M563</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2025-08-19T15:00:44Z</dc:date>
    </item>
    <item>
      <title>Re: Generating a PostgreSQL Table Schema for ETL in Databricks</title>
      <link>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128888#M564</link>
      <description>&lt;P&gt;Thanks for the tips, bro. I’m new to the community and still learning.&lt;/P&gt;</description>
      <pubDate>Tue, 19 Aug 2025 15:02:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128888#M564</guid>
      <dc:creator>WiliamRosa</dc:creator>
      <dc:date>2025-08-19T15:02:49Z</dc:date>
    </item>
    <item>
      <title>Re: Generating a PostgreSQL Table Schema for ETL in Databricks</title>
      <link>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128889#M565</link>
      <description>&lt;P&gt;No problem, that's why I'm sharing some tips with you, so you can avoid problem in future.&lt;/P&gt;</description>
      <pubDate>Tue, 19 Aug 2025 15:03:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128889#M565</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2025-08-19T15:03:49Z</dc:date>
    </item>
    <item>
      <title>Re: Generating a PostgreSQL Table Schema for ETL in Databricks</title>
      <link>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128890#M566</link>
      <description>&lt;P&gt;tks so much&lt;/P&gt;</description>
      <pubDate>Tue, 19 Aug 2025 15:04:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/generating-a-postgresql-table-schema-for-etl-in-databricks/m-p/128890#M566</guid>
      <dc:creator>WiliamRosa</dc:creator>
      <dc:date>2025-08-19T15:04:40Z</dc:date>
    </item>
  </channel>
</rss>

