<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Small json files issue . taking 2 hours to read 3000 files in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/small-json-files-issue-taking-2-hours-to-read-3000-files/m-p/98627#M39765</link>
    <description>&lt;P&gt;Hello, I am trying to read 3000 JSON files, each of which has only one record. It is taking 2 hours to read all the files. How can I perform this operation faster? Please suggest.&lt;/P&gt;</description>
    <pubDate>Wed, 13 Nov 2024 07:39:29 GMT</pubDate>
    <dc:creator>Subhasis</dc:creator>
    <dc:date>2024-11-13T07:39:29Z</dc:date>
    <item>
      <title>Small json files issue . taking 2 hours to read 3000 files</title>
      <link>https://community.databricks.com/t5/data-engineering/small-json-files-issue-taking-2-hours-to-read-3000-files/m-p/98627#M39765</link>
      <description>&lt;P&gt;Hello, I am trying to read 3000 JSON files, each of which has only one record. It is taking 2 hours to read all the files. How can I perform this operation faster? Please suggest.&lt;/P&gt;</description>
      <pubDate>Wed, 13 Nov 2024 07:39:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/small-json-files-issue-taking-2-hours-to-read-3000-files/m-p/98627#M39765</guid>
      <dc:creator>Subhasis</dc:creator>
      <dc:date>2024-11-13T07:39:29Z</dc:date>
    </item>
    <item>
      <title>Re: Small json files issue . taking 2 hours to read 3000 files</title>
      <link>https://community.databricks.com/t5/data-engineering/small-json-files-issue-taking-2-hours-to-read-3000-files/m-p/98629#M39767</link>
      <description>&lt;P&gt;&lt;SPAN&gt;&lt;SPAN class=""&gt;This is the code ---df1 = spark.read.format("json").options(inferSchema="true", multiLine="true").load(file1)&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 13 Nov 2024 08:05:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/small-json-files-issue-taking-2-hours-to-read-3000-files/m-p/98629#M39767</guid>
      <dc:creator>Subhasis</dc:creator>
      <dc:date>2024-11-13T08:05:47Z</dc:date>
    </item>
    <item>
      <title>Re: Small json files issue . taking 2 hours to read 3000 files</title>
      <link>https://community.databricks.com/t5/data-engineering/small-json-files-issue-taking-2-hours-to-read-3000-files/m-p/98648#M39778</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/119085"&gt;@Subhasis&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;You can start off by specifying the schema upfront instead of using the inferSchema option. But to be honest, it is the classic "small file problem". The best approach you can take is to compact those small files into larger ones.&amp;nbsp;&lt;BR /&gt;Or you can read them all and save them as Parquet files with a proper partition size.&lt;BR /&gt;Take a look at the threads below for inspiration:&lt;/P&gt;&lt;P&gt;&lt;A href="https://garrens.com/blog/2017/11/04/big-data-spark-and-its-small-files-problem/?unapproved=252&amp;amp;moderation-hash=5a657350c6169448d65209caa52d5d2c#comment-252" target="_blank"&gt;Big data [Spark] and its small files problem – Garren's [Big] Data Blog&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;A href="https://stackoverflow.com/questions/65139525/reading-millions-of-small-json-files-from-s3-bucket-in-pyspark-very-slow" target="_blank"&gt;apache spark - Reading Millions of Small JSON Files from S3 Bucket in PySpark Very Slow - Stack Overflow&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 13 Nov 2024 11:30:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/small-json-files-issue-taking-2-hours-to-read-3000-files/m-p/98648#M39778</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2024-11-13T11:30:43Z</dc:date>
    </item>
  </channel>
</rss>

