<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Filtering files for query in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53585#M29828</link>
    <description>&lt;P&gt;Hi Kaniz&amp;nbsp;&lt;BR /&gt;I have one more issue: I am writing fewer than 1.2k records to the datalake table (append mode). While writing, it shows "determining dbio file fragments this would take some time", and when I checked the log I see GC allocation failure&amp;nbsp;.&lt;BR /&gt;My overall execution time is 20 mins, which is hard for me. How can I resolve this? Is it mandatory to use Vacuum and Analyze queries along with Optimize?&lt;BR /&gt;Shall I run optimize datalake.table?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 23 Nov 2023 07:37:34 GMT</pubDate>
    <dc:creator>Muhammed</dc:creator>
    <dc:date>2023-11-23T07:37:34Z</dc:date>
    <item>
      <title>Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53302#M29773</link>
      <description>&lt;P&gt;Hi Team,&lt;/P&gt;&lt;P&gt;While writing my data to datalake table I am getting 'filtering files for query', it would be stuck at writing&lt;/P&gt;&lt;P&gt;How can I resolve this issue&lt;/P&gt;</description>
      <pubDate>Tue, 21 Nov 2023 14:13:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53302#M29773</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-11-21T14:13:24Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53305#M29775</link>
      <description>&lt;P&gt;Can you give some more details? Are you doing merge statements? How big are the tables?&lt;/P&gt;&lt;P&gt;For merge statements, for example, the process needs to read the target table to analyze which parquet files need to be rewritten. If you don't have proper partitioning or z-index, it could end up scanning all files even if you only try to update a few rows.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Did you try to optimize the tables already?&lt;/P&gt;</description>
      <pubDate>Tue, 21 Nov 2023 14:51:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53305#M29775</guid>
      <dc:creator>pgruetter</dc:creator>
      <dc:date>2023-11-21T14:51:40Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53308#M29778</link>
      <description>&lt;P&gt;Thanks for the quick reply,&lt;/P&gt;&lt;P&gt;I am using SSMS as my redacted table and it is using upsert as the write mode. That table is huge in size; when I checked the SQL part in&amp;nbsp;&lt;/P&gt;&lt;P&gt;Databricks, it is reading every record into memory&amp;nbsp;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 21 Nov 2023 15:47:36 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53308#M29778</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-11-21T15:47:36Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53310#M29780</link>
      <description />
      <pubDate>Tue, 21 Nov 2023 16:01:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53310#M29780</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-11-21T16:01:43Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53357#M29784</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/20230"&gt;@pgruetter&lt;/a&gt;&amp;nbsp;&lt;BR /&gt;could you please check above?&lt;/P&gt;</description>
      <pubDate>Tue, 21 Nov 2023 18:13:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53357#M29784</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-11-21T18:13:20Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53358#M29785</link>
      <description>&lt;P&gt;Still hard to say, but it sounds like my assumption is correct. Because your upsert doesn't know which records to update, it needs to scan everything. Make sure that it's properly partitioned, that you have a z-index, and that you execute an optimize on the table.&lt;/P&gt;</description>
      <pubDate>Tue, 21 Nov 2023 18:33:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53358#M29785</guid>
      <dc:creator>pgruetter</dc:creator>
      <dc:date>2023-11-21T18:33:26Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53585#M29828</link>
      <description>&lt;P&gt;Hi Kaniz&amp;nbsp;&lt;BR /&gt;I have one more issue: I am writing fewer than 1.2k records to the datalake table (append mode). While writing, it shows "determining dbio file fragments this would take some time", and when I checked the log I see GC allocation failure&amp;nbsp;.&lt;BR /&gt;My overall execution time is 20 mins, which is hard for me. How can I resolve this? Is it mandatory to use Vacuum and Analyze queries along with Optimize?&lt;BR /&gt;Shall I run optimize datalake.table?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 23 Nov 2023 07:37:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53585#M29828</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-11-23T07:37:34Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53586#M29829</link>
      <description />
      <pubDate>Thu, 23 Nov 2023 07:39:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/53586#M29829</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-11-23T07:39:43Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/54979#M30208</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;We are using a framework for data ingestion; I hope this will not cause any issues with the metadata of the datalake table? As per the framework, the metadata of the table is crucial; any changes made to it will affect the system.&lt;/P&gt;&lt;P&gt;Sometimes the particular pipeline takes 2 hrs just to write 1k records.&lt;/P&gt;</description>
      <pubDate>Sat, 09 Dec 2023 16:09:32 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/54979#M30208</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-12-09T16:09:32Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55071#M30237</link>
      <description>&lt;P&gt;Hi &lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Any info on this ?&lt;/P&gt;</description>
      <pubDate>Mon, 11 Dec 2023 17:38:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55071#M30237</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-12-11T17:38:13Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55179#M30260</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/94535"&gt;@Muhammed&lt;/a&gt;&amp;nbsp; describe &amp;lt;table_name&amp;gt; will give you idea about how your table is partitioned. Consider adding partition column condition in where clause for better performance.&lt;/P&gt;</description>
      <pubDate>Wed, 13 Dec 2023 02:23:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55179#M30260</guid>
      <dc:creator>kulkpd</dc:creator>
      <dc:date>2023-12-13T02:23:13Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55180#M30261</link>
      <description>&lt;P&gt;I understand you are getting '&lt;SPAN&gt;filtering files for query' while writing.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;From the screenshot it looks like you have 157 million files in the source location. Can you please try dividing the files by prefix so that small microbatches can be processed in parallel?&lt;BR /&gt;&lt;BR /&gt;Try to use the maxFilesPerTrigger option to restrict the number of files per batch.&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 13 Dec 2023 02:33:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55180#M30261</guid>
      <dc:creator>kulkpd</dc:creator>
      <dc:date>2023-12-13T02:33:10Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55239#M30277</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/90201"&gt;@kulkpd&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Where did you get the&amp;nbsp; info related to 157 million files ? If possible could you pls explain it&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 14 Dec 2023 12:26:28 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55239#M30277</guid>
      <dc:creator>Muhammed</dc:creator>
      <dc:date>2023-12-14T12:26:28Z</dc:date>
    </item>
    <item>
      <title>Re: Filtering files for query</title>
      <link>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55272#M30284</link>
      <description>&lt;P&gt;My bad, I saw that somewhere in the screenshot but am not able to find it now.&lt;BR /&gt;Which source are you using to load the data: delta table, aws-s3, or azure-storage?&lt;/P&gt;</description>
      <pubDate>Thu, 14 Dec 2023 18:31:14 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/filtering-files-for-query/m-p/55272#M30284</guid>
      <dc:creator>kulkpd</dc:creator>
      <dc:date>2023-12-14T18:31:14Z</dc:date>
    </item>
  </channel>
</rss>

