<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: DQ-Quality Check- what are the best method to validate the two parquet files . in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83888#M37054</link>
    <description>&lt;P&gt;It does use spark. But of course it is an expensive operation as all records are compared.&lt;BR /&gt;In my experience the performance is reasonable.&lt;/P&gt;</description>
    <pubDate>Thu, 22 Aug 2024 07:12:37 GMT</pubDate>
    <dc:creator>-werners-</dc:creator>
    <dc:date>2024-08-22T07:12:37Z</dc:date>
    <item>
      <title>DQ-Quality Check- what are the best method to validate the two parquet files .</title>
      <link>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83175#M36861</link>
      <description>&lt;P&gt;DQ-Quality Check. We have to validate the landing data against the bronze data using data quality checks. Below are the data quality checks.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;1. Compare the record counts of the 2 files. If they match, go to point 2.&lt;/P&gt;&lt;P&gt;2. If the counts match, validate the data row by row using the keys. If the keys match, compare the remaining columns. If the columns do not match, store them in an error log file.&lt;/P&gt;&lt;P&gt;What is the best methodology we can use for this in PySpark (Databricks)?&lt;/P&gt;</description>
      <pubDate>Fri, 16 Aug 2024 05:56:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83175#M36861</guid>
      <dc:creator>rameshybr</dc:creator>
      <dc:date>2024-08-16T05:56:23Z</dc:date>
    </item>
    <item>
      <title>Re: DQ-Quality Check- what are the best method to validate the two parquet files .</title>
      <link>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83807#M37009</link>
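      <description>&lt;P&gt;What you are looking for is except and exceptAll.&lt;BR /&gt;For example, df1.except(df2) (in PySpark, where except is a reserved word, use df1.subtract(df2) or df1.exceptAll(df2)).&lt;BR /&gt;It returns the rows of df1 that have no match in df2.&lt;/P&gt;</description>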
      <pubDate>Wed, 21 Aug 2024 13:47:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83807#M37009</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2024-08-21T13:47:50Z</dc:date>
    </item>
    <item>
      <title>Re: DQ-Quality Check- what are the best method to validate the two parquet files .</title>
      <link>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83825#M37012</link>
      <description>&lt;P&gt;Thanks Werners. Will it provide good performance?&lt;/P&gt;</description>
      <pubDate>Wed, 21 Aug 2024 15:18:48 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83825#M37012</guid>
      <dc:creator>rameshybr</dc:creator>
      <dc:date>2024-08-21T15:18:48Z</dc:date>
    </item>
    <item>
      <title>Re: DQ-Quality Check- what are the best method to validate the two parquet files .</title>
      <link>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83888#M37054</link>
      <description>&lt;P&gt;It does use spark. But of course it is an expensive operation as all records are compared.&lt;BR /&gt;In my experience the performance is reasonable.&lt;/P&gt;</description>
      <pubDate>Thu, 22 Aug 2024 07:12:37 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83888#M37054</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2024-08-22T07:12:37Z</dc:date>
    </item>
    <item>
      <title>Re: DQ-Quality Check- what are the best method to validate the two parquet files .</title>
      <link>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83891#M37057</link>
      <description>&lt;P&gt;Try this; it covers the second point, assuming the first point (the counts) already matches.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Define key columns
key_columns = ["key_column1", "key_column2"]  # Adjust according to your data schema

# Perform an outer join to find mismatches
joined_df = landing_df.alias("landing").join(
    bronze_df.alias("bronze"),
    on=key_columns,
    how="outer"
)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 22 Aug 2024 07:34:00 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dq-quality-check-what-are-the-best-method-to-validate-the-two/m-p/83891#M37057</guid>
      <dc:creator>Rishabh-Pandey</dc:creator>
      <dc:date>2024-08-22T07:34:00Z</dc:date>
    </item>
  </channel>
</rss>

