<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: How to process all Azure storage file from Databricks in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37876#M26496</link>
    <description>&lt;PRE&gt;df = spark.read.csv("/mnt/lake/data/csv")&lt;/PRE&gt;&lt;P&gt;Here I assume "/mnt/lake/data/csv" is the directory with the 5 files.&lt;BR /&gt;spark.read.csv also has some options like the separator, header, etc.:&lt;BR /&gt;&lt;A href="https://spark.apache.org/docs/latest/sql-data-sources-csv.html" target="_self"&gt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;So there is no need to do this one by one; read the whole directory in one go.&lt;/P&gt;</description>
    <pubDate>Tue, 18 Jul 2023 15:18:20 GMT</pubDate>
    <dc:creator>-werners-</dc:creator>
    <dc:date>2023-07-18T15:18:20Z</dc:date>
    <item>
      <title>How to process all Azure storage file from Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37871#M26491</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I want to process all the files that are in my Azure storage using Databricks. What is the process?&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 14:34:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37871#M26491</guid>
      <dc:creator>bchaubey</dc:creator>
      <dc:date>2023-07-18T14:34:20Z</dc:date>
    </item>
    <item>
      <title>Re: How to process all Azure storage file from Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37872#M26492</link>
      <description>&lt;P&gt;It depends on what you mean by 'process'.&lt;BR /&gt;Spark can read several files at once. All you need is the path to a directory with files.&lt;BR /&gt;Then you can read the whole directory using spark.read.parquet/csv/json/... (depending on your file format).&lt;/P&gt;&lt;P&gt;It is important, however, that all files have the same schema (columns); otherwise this approach will not work.&lt;/P&gt;&lt;P&gt;Is this what you are looking for? Or do you also need help with linking your data lake to Databricks?&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 14:59:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37872#M26492</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-07-18T14:59:58Z</dc:date>
    </item>
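    <!--
      A minimal PySpark sketch of the directory read described in the post above; the
      directory paths are placeholders for illustration, not values from this thread:

        # One call reads every file in the directory into a single DataFrame,
        # provided all files share the same schema (columns)
        df_parquet = spark.read.parquet("/mnt/lake/data/parquet")
        df_json    = spark.read.json("/mnt/lake/data/json")
        df_csv     = spark.read.csv("/mnt/lake/data/csv")
    -->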
    <item>
      <title>Re: How to process all Azure storage file from Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37874#M26494</link>
      <description>&lt;P&gt;The attachment contains only one file, 003.csv. Suppose I have 5 files and they all have the same schema. How can I load them into a dataframe one by one?&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 15:14:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37874#M26494</guid>
      <dc:creator>bchaubey</dc:creator>
      <dc:date>2023-07-18T15:14:29Z</dc:date>
    </item>
    <item>
      <title>Re: How to process all Azure storage file from Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37876#M26496</link>
      <description>&lt;PRE&gt;df = spark.read.csv("/mnt/lake/data/csv")&lt;/PRE&gt;&lt;P&gt;Here I assume "/mnt/lake/data/csv" is the directory with the 5 files.&lt;BR /&gt;spark.read.csv also has some options like the separator, header, etc.:&lt;BR /&gt;&lt;A href="https://spark.apache.org/docs/latest/sql-data-sources-csv.html" target="_self"&gt;https://spark.apache.org/docs/latest/sql-data-sources-csv.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;So there is no need to do this one by one; read the whole directory in one go.&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 15:18:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37876#M26496</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-07-18T15:18:20Z</dc:date>
    </item>
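    <!--
      A short sketch of the separator/header options mentioned in the post above; the
      option values here are assumptions for illustration:

        # Read all CSV files in the directory in one go, with common reader options
        df = (spark.read
              .option("header", "true")       # treat the first line of each file as a header
              .option("sep", ",")             # field separator (assumed comma here)
              .option("inferSchema", "true")  # let Spark infer column types
              .csv("/mnt/lake/data/csv"))

        df.printSchema()   # one schema covering rows from all files in the directory
    -->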
    <item>
      <title>Re: How to process all Azure storage file from Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37877#M26497</link>
      <description>&lt;P&gt;Could you please provide me the code for my scenario?&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 15:25:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37877#M26497</guid>
      <dc:creator>bchaubey</dc:creator>
      <dc:date>2023-07-18T15:25:42Z</dc:date>
    </item>
    <item>
      <title>Re: How to process all Azure storage file from Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37928#M26510</link>
      <description>&lt;P&gt;Well, my previous post essentially is the code.&lt;BR /&gt;That single read will load all the files in the directory into the dataframe.&lt;BR /&gt;What else do you need?&lt;/P&gt;</description>
      <pubDate>Wed, 19 Jul 2023 07:15:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-process-all-azure-storage-file-from-databricks/m-p/37928#M26510</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-07-19T07:15:50Z</dc:date>
    </item>
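    <!--
      A fuller sketch for the scenario in this thread (several same-schema CSV files in
      Azure storage). The storage account, container, and secret scope names are
      placeholders, not values from the thread; mounting the storage or using a service
      principal would work equally well.

        # Authenticate to the ADLS Gen2 account with an account key kept in a secret scope
        spark.conf.set(
            "fs.azure.account.key.mystorageaccount.dfs.core.windows.net",
            dbutils.secrets.get(scope="my-scope", key="storage-account-key"))

        # A single read picks up all files in the directory; no per-file loop is needed
        path = "abfss://mycontainer@mystorageaccount.dfs.core.windows.net/data/csv"
        df = spark.read.option("header", "true").csv(path)

        df.count()   # total rows across all the CSV files
    -->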
  </channel>
</rss>

