<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic cloning the data between two catalogs in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/cloning-the-data-between-two-catalogs/m-p/100322#M40266</link>
    <description>&lt;P&gt;Hello community,&lt;/P&gt;&lt;P&gt;I was writing this piece of code to do the data migration between two catalogs:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;        # Read data and partitioning
        print(f"Loading {table_name} from production catalog...")
        prod_df_table_name = f"prod_catalog.`00_bronze_layer`.pg_{table_name}"
        prod_df_table = spark.read.table(prod_df_table_name).persist(StorageLevel.MEMORY_AND_DISK)

        # Write with optimizations
        print(f"Saving {table_name} to Bronze Schema")
        stg_df_table_name = f"stg_catalog.`00_bronze_layer`.pg_{table_name}"
        if partition_col in ["all", "none", "export_run_timestamp", "updated_at", "received_at"]:
            prod_df_table.write \
                .format("delta") \
                .mode("overwrite") \
                .option("mergeSchema", "true") \
                .saveAsTable(stg_df_table_name)
        else:
            prod_df_table.write \
                .format("delta") \
                .mode("overwrite") \
                .partitionBy(partition_col) \
                .option("mergeSchema", "true") \
                .saveAsTable(stg_df_table_name)

        prod_df_table.unpersist()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am also using a small cluster with 14 GB and 4 cores, without any worker nodes. Any suggestions to improve the speed of the copy? :( I'm going to work with tables that sometimes hold 10 gigabytes of data.&lt;/P&gt;</description>
    <pubDate>Thu, 28 Nov 2024 11:17:55 GMT</pubDate>
    <dc:creator>jeremy98</dc:creator>
    <dc:date>2024-11-28T11:17:55Z</dc:date>
    <item>
      <title>cloning the data between two catalogs</title>
      <link>https://community.databricks.com/t5/data-engineering/cloning-the-data-between-two-catalogs/m-p/100322#M40266</link>
      <description>&lt;P&gt;Hello community,&lt;/P&gt;&lt;P&gt;I was writing this piece of code to do the data migration between two catalogs:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;        # Read data and partitioning
        print(f"Loading {table_name} from production catalog...")
        prod_df_table_name = f"prod_catalog.`00_bronze_layer`.pg_{table_name}"
        prod_df_table = spark.read.table(prod_df_table_name).persist(StorageLevel.MEMORY_AND_DISK)

        # Write with optimizations
        print(f"Saving {table_name} to Bronze Schema")
        stg_df_table_name = f"stg_catalog.`00_bronze_layer`.pg_{table_name}"
        if partition_col in ["all", "none", "export_run_timestamp", "updated_at", "received_at"]:
            prod_df_table.write \
                .format("delta") \
                .mode("overwrite") \
                .option("mergeSchema", "true") \
                .saveAsTable(stg_df_table_name)
        else:
            prod_df_table.write \
                .format("delta") \
                .mode("overwrite") \
                .partitionBy(partition_col) \
                .option("mergeSchema", "true") \
                .saveAsTable(stg_df_table_name)

        prod_df_table.unpersist()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am also using a small cluster with 14 GB and 4 cores, without any worker nodes. Any suggestions to improve the speed of the copy? :( I'm going to work with tables that sometimes hold 10 gigabytes of data.&lt;/P&gt;</description>
      <pubDate>Thu, 28 Nov 2024 11:17:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/cloning-the-data-between-two-catalogs/m-p/100322#M40266</guid>
      <dc:creator>jeremy98</dc:creator>
      <dc:date>2024-11-28T11:17:55Z</dc:date>
    </item>
    <item>
      <title>Re: cloning the data between two catalogs</title>
      <link>https://community.databricks.com/t5/data-engineering/cloning-the-data-between-two-catalogs/m-p/100339#M40267</link>
      <description>&lt;P&gt;FYI,&lt;BR /&gt;I did it by increasing the size of the cluster, using more cores, and writing the table directly:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;prod_df_table.write \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;.&lt;/SPAN&gt;&lt;SPAN&gt;format&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"delta"&lt;/SPAN&gt;&lt;SPAN&gt;) \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;.&lt;/SPAN&gt;&lt;SPAN&gt;mode&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"overwrite"&lt;/SPAN&gt;&lt;SPAN&gt;) \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;.&lt;/SPAN&gt;&lt;SPAN&gt;saveAsTable&lt;/SPAN&gt;&lt;SPAN&gt;(stg_df_table_name)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Thu, 28 Nov 2024 13:47:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/cloning-the-data-between-two-catalogs/m-p/100339#M40267</guid>
      <dc:creator>jeremy98</dc:creator>
      <dc:date>2024-11-28T13:47:15Z</dc:date>
    </item>
  </channel>
</rss>

