<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Databricks Excel Reader in Community Articles</title>
    <link>https://community.databricks.com/t5/community-articles/databricks-excel-reader/m-p/142911#M925</link>
    <description>&lt;P&gt;Nice example!&lt;/P&gt;</description>
    <pubDate>Sat, 03 Jan 2026 21:55:41 GMT</pubDate>
    <dc:creator>Hubert-Dudek</dc:creator>
    <dc:date>2026-01-03T21:55:41Z</dc:date>
    <item>
      <title>Databricks Excel Reader</title>
      <link>https://community.databricks.com/t5/community-articles/databricks-excel-reader/m-p/142766#M920</link>
      <description>&lt;P&gt;&lt;SPAN&gt;We’ve recently created a new Excel reader function, and I decided to have a play around with it. I’ve used an open dataset for this tutorial, so you can follow along too.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Using file available here -&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;A href="https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/datasets/labourdemandvolumesbystandardoccupationclassificationsoc2020uk" target="_blank"&gt;&lt;SPAN&gt;https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/datasets/labourdemandvolumesbystandardoccupationclassificationsoc2020uk&lt;/SPAN&gt;&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Before you start ensure you have the beta for Excel ingestion available in your workspace:&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="emma_s_0-1767193777925.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22606iFDCCF57E6E98F1E7/image-size/medium?v=v2&amp;amp;px=400" role="button" title="emma_s_0-1767193777925.png" alt="emma_s_0-1767193777925.png" /&gt;&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Then upload the file you downloaded into the volume:&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="emma_s_1-1767193777925.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22608iFA9B1627BD1CE55E/image-size/medium?v=v2&amp;amp;px=400" role="button" title="emma_s_1-1767193777925.png" alt="emma_s_1-1767193777925.png" /&gt;&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Once I’ve uploaded my file, I can simply read it into a dataframe, using the following:&lt;/SPAN&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;df = (spark.read.excel("/Volumes/central_data/performance_data/excel_demo/labourdemandbyoccupation.xlsx"))
display(df)

&lt;/LI-CODE&gt;
&lt;P&gt;&lt;SPAN&gt;However, in this case this just gives me the first sheet, so instead I want to specify a data range, to do this I can do the following:&lt;/SPAN&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;df = (spark.read

      .option("headerRows", 1)

      .option("dataAddress", "'Table 1'!A5:DD11")

      .excel("/Volumes/central_data/performance_data/excel_demo/labourdemandbyoccupation.xlsx"))

display(df)&lt;/LI-CODE&gt;
&lt;P&gt;&lt;SPAN&gt;Now I get a nice version of the data in Table 1.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="emma_s_2-1767193777925.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22607i0CC8107A964C1A9E/image-size/medium?v=v2&amp;amp;px=400" role="button" title="emma_s_2-1767193777925.png" alt="emma_s_2-1767193777925.png" /&gt;&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;I can also do this in SQL&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;%sql

-- Read a specific sheet and range

CREATE TABLE central_data.performance_data.sheet1table AS

SELECT * FROM read_files(

 "/Volumes/central_data/performance_data/excel_demo/labourdemandbyoccupation.xlsx",

 format =&amp;gt; "excel",

 headerRows =&amp;gt; 1,

 dataAddress =&amp;gt; "'Table 1'!A5:DD11",

 schemaEvolutionMode =&amp;gt; "none" )

&lt;/LI-CODE&gt;
&lt;P&gt;&lt;BR /&gt;&lt;SPAN&gt;But what if I want to loop through and return the data into tables for all sheets? Well then, I can use the sheets method to return all the names of sheets in my workbook&lt;/SPAN&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;df = (spark.read.format("excel")

      .option("operation", "listSheets")

      .load("/Volumes/central_data/performance_data/excel_demo/labourdemandbyoccupation.xlsx"))

display(df)

&lt;/LI-CODE&gt;
&lt;P&gt;&lt;BR /&gt;&lt;SPAN&gt;I can then loop through them to import the sheets one at a time. You’ll notice I’ve excluded the ones I don’t want.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;I’ve also not treated the title as a header row as it sometimes contains disallowed characters. I could import each of these and then tidy them up to rename my table.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;sheet_names = [row['sheetName'] for row in df.collect()]



for sheet in sheet_names:

   if "table" in sheet.lower() and sheet.lower() != "table of contents":

       data_df = (spark.read

                  .option("headerRows", 0)

                  .option("dataAddress", f"'{sheet}'!A5:DD5374")

                  .excel("/Volumes/central_data/performance_data/excel_demo/labourdemandbyoccupation.xlsx"))

       table_name = f"central_data.performance_data.{sheet.replace(' ', '_').lower()}"

       data_df.write.mode("overwrite").saveAsTable(table_name)&lt;/LI-CODE&gt;
&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;At the end I have a series of tables in my catalogue representing each table in the Excel workbook. However, now the names are messy and I have a lot of additional null rows. So to tidy this up I use the following code.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;import re

from pyspark.sql.types import StructType, StructField, StringType

from pyspark.sql import functions as F



sheet_names = [row['sheetName'] for row in df.collect()]







for sheet in sheet_names:

   if "table" in sheet.lower() and sheet.lower() != "table of contents":

       table_name = f"central_data.performance_data.{sheet.replace(' ', '_').lower()}"

      

       # Drop the table if it exists

      

      

       raw_df = spark.table(table_name)

       first_row = raw_df.limit(1).collect()[0]

       new_columns = [clean_column_name(str(col)) for col in first_row]

       data_df = raw_df.tail(raw_df.count() - 1)

      

       schema = StructType([StructField(col, StringType(), True) for col in new_columns])

       data_df = spark.createDataFrame(data_df, schema=schema)

       cols_to_drop = [col for col in data_df.columns if col.lower() == "none"]

       data_df = data_df.drop(*cols_to_drop)

       non_null_df = data_df.na.drop(how="all")

       spark.sql(f"DROP TABLE IF EXISTS {table_name}")

       non_null_df.write.mode("overwrite").saveAsTable(table_name)&lt;/LI-CODE&gt;
&lt;P&gt;&lt;BR /&gt;&lt;SPAN&gt;And there you have it, a lovely cleaned-up table:&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="emma_s_3-1767193777926.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22609i2F5AA88587C5DF87/image-size/medium?v=v2&amp;amp;px=400" role="button" title="emma_s_3-1767193777926.png" alt="emma_s_3-1767193777926.png" /&gt;&lt;/span&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;It is also worth noting that these methods can all be used by Auto Loader, so I can use this to upload new Excel files as they arrive. I hope you’ve enjoyed this walkthrough. Let me know how you get on. Docs for the feature are available&lt;/SPAN&gt;&lt;A href="https://docs.databricks.com/aws/en/query/formats/excel" target="_blank"&gt;&lt;SPAN&gt; here&lt;/SPAN&gt;&lt;/A&gt;&lt;SPAN&gt;.&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 31 Dec 2025 15:14:28 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/databricks-excel-reader/m-p/142766#M920</guid>
      <dc:creator>emma_s</dc:creator>
      <dc:date>2025-12-31T15:14:28Z</dc:date>
    </item>
    <item>
      <title>Re: Databricks Excel Reader</title>
      <link>https://community.databricks.com/t5/community-articles/databricks-excel-reader/m-p/142911#M925</link>
      <description>&lt;P&gt;Nice example!&lt;/P&gt;</description>
      <pubDate>Sat, 03 Jan 2026 21:55:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/community-articles/databricks-excel-reader/m-p/142911#M925</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2026-01-03T21:55:41Z</dc:date>
    </item>
  </channel>
</rss>

