<?xml version="1.0" encoding="UTF-8"?>
<!--
  RSS 2.0 feed for a single community.databricks.com thread on the
  Data Engineering board. The channel holds one <item> per post: the
  opening question first, then the reply (title prefixed with "Re:").

  Namespace prefixes: only dc (Dublin Core) is used by elements in this
  document; content, rdf and taxo are declared on the root but unused here.
-->
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <!--
      Channel metadata. Apart from the title, these fields repeat the opening
      post: link, description, pubDate, dc:creator and dc:date are identical
      to those of the first <item> below.
    -->
    <title>topic Best Cluster Setup for intensive transformation workload in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/best-cluster-setup-for-intensive-transformation-workload/m-p/39401#M26961</link>
    <!-- Description text is HTML with its markup entity-escaped (&lt;P&gt; ... &lt;/P&gt;); a consumer has to unescape it once to get the HTML. -->
    <description>&lt;P&gt;I have a pyspark dataframe, 61k rows, 3 columns, one of which is a string column which has a max length of 4k. I'm doing about 100 different regexp_replace functions on this dataframe, so, very resource intensive. I'm trying to write this to a delta table, but it seems no matter what compute I use I can't seem to get it to run in an hour. I know the code works because I limited it to 500 rows to test and it ran in about 30 seconds, so I know it just has to do with the magnitude of the data. Has anyone done something on this scale before, and do you know how I get this to run in an hour without breaking the bank?&lt;/P&gt;</description>
    <!-- The same instant is given twice: pubDate in RFC 822 style, dc:date in ISO 8601 (UTC). -->
    <pubDate>Tue, 08 Aug 2023 18:35:12 GMT</pubDate>
    <dc:creator>AChang</dc:creator>
    <dc:date>2023-08-08T18:35:12Z</dc:date>
    <!-- Opening post of the thread. Its guid is the same URL as its link. -->
    <item>
      <title>Best Cluster Setup for intensive transformation workload</title>
      <link>https://community.databricks.com/t5/data-engineering/best-cluster-setup-for-intensive-transformation-workload/m-p/39401#M26961</link>
      <description>&lt;P&gt;I have a pyspark dataframe, 61k rows, 3 columns, one of which is a string column which has a max length of 4k. I'm doing about 100 different regexp_replace functions on this dataframe, so, very resource intensive. I'm trying to write this to a delta table, but it seems no matter what compute I use I can't seem to get it to run in an hour. I know the code works because I limited it to 500 rows to test and it ran in about 30 seconds, so I know it just has to do with the magnitude of the data. Has anyone done something on this scale before, and do you know how I get this to run in an hour without breaking the bank?&lt;/P&gt;</description>
      <pubDate>Tue, 08 Aug 2023 18:35:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/best-cluster-setup-for-intensive-transformation-workload/m-p/39401#M26961</guid>
      <dc:creator>AChang</dc:creator>
      <dc:date>2023-08-08T18:35:12Z</dc:date>
    </item>
    <!-- Reply posted the following day; its description embeds an escaped HTML anchor pointing at the Databricks cluster best-practices page. -->
    <item>
      <title>Re: Best Cluster Setup for intensive transformation workload</title>
      <link>https://community.databricks.com/t5/data-engineering/best-cluster-setup-for-intensive-transformation-workload/m-p/39447#M26983</link>
      <description>&lt;P&gt;It seems that you're trying to apply a lot of transformations, but it's basic stuff, so I'd go for the best practices documentation and find a way to create a compute-optimized cluster.&lt;/P&gt;&lt;P&gt;Ref.:&amp;nbsp;&lt;A href="https://docs.databricks.com/en/clusters/cluster-config-best-practices.html#basic-batch-etl" target="_blank"&gt;https://docs.databricks.com/en/clusters/cluster-config-best-practices.html#basic-batch-etl&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 09 Aug 2023 14:14:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/best-cluster-setup-for-intensive-transformation-workload/m-p/39447#M26983</guid>
      <dc:creator>Leonardo</dc:creator>
      <dc:date>2023-08-09T14:14:16Z</dc:date>
    </item>
  </channel>
</rss>

