<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: need to ingest millions of csv files from aws s3 in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/54876#M30181</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88298"&gt;@Kumarashokjmu&lt;/a&gt;,&lt;/P&gt;
&lt;P&gt;I would recommend using Databricks Auto Loader to ingest your CSV files incrementally. You can find examples and more details here:&amp;nbsp;&lt;A href="https://docs.databricks.com/en/ingestion/auto-loader/index.html#what-is-auto-loader" target="_blank"&gt;https://docs.databricks.com/en/ingestion/auto-loader/index.html#what-is-auto-loader&lt;/A&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 07 Dec 2023 17:59:06 GMT</pubDate>
    <dc:creator>jose_gonzalez</dc:creator>
    <dc:date>2023-12-07T17:59:06Z</dc:date>
    <item>
      <title>need to ingest millions of csv files from aws s3</title>
      <link>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/54689#M30145</link>
      <description>&lt;P&gt;I need to ingest millions of CSV files from an AWS S3 bucket. I am running into AWS S3 throttling, and on top of that the notebook process runs for 8+ hours and sometimes fails. Looking at cluster performance, it is only about 60% utilized.&lt;/P&gt;&lt;P&gt;I need suggestions on avoiding AWS throttling, on what the source file size should be if I have to combine small files into bigger ones for processing, on speeding up ingestion, and on any other Spark parameters that need tuning.&lt;/P&gt;&lt;P&gt;Thanks in advance.&lt;/P&gt;&lt;P&gt;Ash&lt;/P&gt;</description>
      <pubDate>Tue, 05 Dec 2023 22:20:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/54689#M30145</guid>
      <dc:creator>Kumarashokjmu</dc:creator>
      <dc:date>2023-12-05T22:20:03Z</dc:date>
    </item>
    <item>
      <title>Re: need to ingest millions of csv files from aws s3</title>
      <link>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/54803#M30166</link>
      <description>&lt;P&gt;Thank you so much, Kaniz. I really appreciate your detailed response on each topic. I will post more over time to get your help.&lt;/P&gt;&lt;P&gt;Ashok&lt;/P&gt;</description>
      <pubDate>Wed, 06 Dec 2023 18:11:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/54803#M30166</guid>
      <dc:creator>Kumarashokjmu</dc:creator>
      <dc:date>2023-12-06T18:11:58Z</dc:date>
    </item>
    <item>
      <title>Re: need to ingest millions of csv files from aws s3</title>
      <link>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/54876#M30181</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88298"&gt;@Kumarashokjmu&lt;/a&gt;,&lt;/P&gt;
&lt;P&gt;I would recommend using Databricks Auto Loader to ingest your CSV files incrementally; a minimal sketch follows below. You can find examples and more details here:&amp;nbsp;&lt;A href="https://docs.databricks.com/en/ingestion/auto-loader/index.html#what-is-auto-loader" target="_blank"&gt;https://docs.databricks.com/en/ingestion/auto-loader/index.html#what-is-auto-loader&lt;/A&gt;&lt;/P&gt;
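&lt;P&gt;For illustration, a minimal PySpark sketch of incremental CSV ingestion with Auto Loader, assuming placeholder s3:// paths and a placeholder target table (none of these names come from the thread):&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;# Minimal Auto Loader sketch: incremental CSV ingestion from S3 into a Delta table.
# All s3:// paths and the table name are hypothetical placeholders.
df = (spark.readStream
      .format("cloudFiles")                # Auto Loader source
      .option("cloudFiles.format", "csv")
      .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/csv_ingest")
      .option("header", "true")
      .load("s3://my-bucket/raw/csv/"))

(df.writeStream
   .option("checkpointLocation", "s3://my-bucket/_checkpoints/csv_ingest")
   .trigger(availableNow=True)             # process all files available now, then stop
   .toTable("bronze.csv_events"))&lt;/CODE&gt;&lt;/PRE&gt;</description>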
      <pubDate>Thu, 07 Dec 2023 17:59:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/54876#M30181</guid>
      <dc:creator>jose_gonzalez</dc:creator>
      <dc:date>2023-12-07T17:59:06Z</dc:date>
    </item>
    <item>
      <title>Re: need to ingest millions of csv files from aws s3</title>
      <link>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/55155#M30246</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88298"&gt;@Kumarashokjmu&lt;/a&gt;,&lt;/P&gt;
&lt;P&gt;Just a friendly follow-up: did you have time to test Auto Loader? Do you have any follow-up questions? Please let us know.&lt;/P&gt;</description>
      <pubDate>Tue, 12 Dec 2023 19:15:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/55155#M30246</guid>
      <dc:creator>jose_gonzalez</dc:creator>
      <dc:date>2023-12-12T19:15:29Z</dc:date>
    </item>
    <item>
      <title>Re: need to ingest millions of csv files from aws s3</title>
      <link>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/55176#M30257</link>
      <description>&lt;P&gt;If you want to load all the data at once, use an Auto Loader or DLT pipeline with directory listing, provided the files are lexically ordered.&lt;BR /&gt;&lt;BR /&gt;OR&lt;BR /&gt;If you want to perform an incremental load, split it into two jobs, a historic data load and a live data load:&lt;BR /&gt;Live data:&lt;BR /&gt;Use Auto Loader or a Delta Live Tables pipeline with file notification to load the data into a Delta table. File notification is scalable and is the approach Databricks recommends.&amp;nbsp;&lt;BR /&gt;&lt;A class="" href="https://docs.databricks.com/en/ingestion/auto-loader/options.html#directory-listing-options" target="_blank" rel="nofollow noopener"&gt;https://docs.databricks.com/en/ingestion/auto-loader/options.html#directory-listing-options&lt;/A&gt;&lt;BR /&gt;&lt;BR /&gt;Historic load:&amp;nbsp;&lt;BR /&gt;Use an Auto Loader job to load all the data. If the files are not lexically ordered, try using the S3 inventory option to divide the workload into micro-batches; with this approach, multiple batches can run in parallel.&lt;BR /&gt;&lt;BR /&gt;Handling S3 throttling:&lt;BR /&gt;If you are facing S3 throttling, try limiting maxFilesPerTrigger to 10k-15k and increase the spark.network.timeout configuration in the cluster's Spark config. A sketch follows below.&lt;BR /&gt;&lt;BR /&gt;Let us know if you need more information.&lt;/P&gt;
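&lt;P&gt;For illustration, a minimal PySpark sketch of the file-notification setup with a cap on files per batch, assuming placeholder s3:// paths and a placeholder target table (none of these names come from the thread):&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;# Minimal Auto Loader sketch in file-notification mode with a batch-size cap.
# All s3:// paths and the table name are hypothetical placeholders.
# spark.network.timeout is raised in the cluster's Spark config, not in this code.
df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "csv")
      .option("cloudFiles.useNotifications", "true")      # file-notification mode (SQS/SNS)
      .option("cloudFiles.maxFilesPerTrigger", "10000")   # cap files per micro-batch to ease S3 throttling
      .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/csv_ingest")
      .option("header", "true")
      .load("s3://my-bucket/raw/csv/"))

(df.writeStream
   .option("checkpointLocation", "s3://my-bucket/_checkpoints/csv_notify")
   .toTable("bronze.csv_events"))&lt;/CODE&gt;&lt;/PRE&gt;</description>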
      <pubDate>Wed, 13 Dec 2023 01:54:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/need-to-ingest-millions-of-csv-files-from-aws-s3/m-p/55176#M30257</guid>
      <dc:creator>kulkpd</dc:creator>
      <dc:date>2023-12-13T01:54:56Z</dc:date>
    </item>
  </channel>
</rss>