<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>Topic: Efficient Detection of Schema Mismatch in CSV Files During Single Pass Reading in Get Started Discussions</title>
    <link>https://community.databricks.com/t5/get-started-discussions/efficient-detection-of-schema-mismatch-in-csv-files-during/m-p/55857#M2074</link>
    <description>&lt;P&gt;Hello, when I read a CSV file with a schema object, if a column in the original CSV contains a value of a different datatype than specified in the schema, the result is a null cell. Is there an efficient way to identify these cases without having to read the CSV file twice—first with the inferSchema option set to FALSE (reading all columns as strings) and then again with a schema object—followed by comparing the count of nulls for each column?&lt;/P&gt;&lt;P&gt;Thank you&lt;/P&gt;</description>
    <pubDate>Wed, 27 Dec 2023 19:01:50 GMT</pubDate>
    <dc:creator>reuvenk121</dc:creator>
    <dc:date>2023-12-27T19:01:50Z</dc:date>
    <item>
      <title>Efficient Detection of Schema Mismatch in CSV Files During Single Pass Reading</title>
      <link>https://community.databricks.com/t5/get-started-discussions/efficient-detection-of-schema-mismatch-in-csv-files-during/m-p/55857#M2074</link>
      <description>&lt;P&gt;Hello, when I read a CSV file with a schema object, if a column in the original CSV contains a value of a different datatype than specified in the schema, the result is a null cell. Is there an efficient way to identify these cases without having to read the CSV file twice—first with the inferSchema option set to FALSE (reading all columns as strings) and then again with a schema object—followed by comparing the count of nulls for each column?&lt;/P&gt;&lt;P&gt;Thank you&lt;/P&gt;</description>
      <pubDate>Wed, 27 Dec 2023 19:01:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/efficient-detection-of-schema-mismatch-in-csv-files-during/m-p/55857#M2074</guid>
      <dc:creator>reuvenk121</dc:creator>
      <dc:date>2023-12-27T19:01:50Z</dc:date>
    </item>
    <item>
      <title>Re: Efficient Detection of Schema Mismatch in CSV Files During Single Pass Reading</title>
      <link>https://community.databricks.com/t5/get-started-discussions/efficient-detection-of-schema-mismatch-in-csv-files-during/m-p/55875#M2077</link>
      <description>&lt;P&gt;Maybe you can try to read the data and let Auto Loader move mismatched data, e.g., to the rescued data column&lt;/P&gt;&lt;P&gt;&lt;A href="https://learn.microsoft.com/en-us/azure/databricks/ingestion/auto-loader/schema#--what-is-the-rescued-data-column" target="_blank"&gt;https://learn.microsoft.com/en-us/azure/databricks/ingestion/auto-loader/schema#--what-is-the-rescued-data-column&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Then you can decide what to do with the rescued data.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;If you think that there will be a lot of issues with data types, you can also try loading everything as STRING to a Bronze table, then create a Silver table with a specific schema and set up a loading procedure.&lt;BR /&gt;You can then move bad records to badRecordsPath in case of a schema mismatch.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 28 Dec 2023 09:18:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/efficient-detection-of-schema-mismatch-in-csv-files-during/m-p/55875#M2077</guid>
      <dc:creator>Wojciech_BUK</dc:creator>
      <dc:date>2023-12-28T09:18:06Z</dc:date>
    </item>
  </channel>
</rss>

