<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Some records are missing after window function in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109310#M43271</link>
    <description>&lt;P&gt;Thanks Madhu! Will try this.&lt;/P&gt;</description>
    <pubDate>Thu, 06 Feb 2025 21:42:04 GMT</pubDate>
    <dc:creator>asurendran</dc:creator>
    <dc:date>2025-02-06T21:42:04Z</dc:date>
    <item>
      <title>Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109275#M43263</link>
      <description>&lt;P&gt;While loading data from one layer to another layer using a PySpark window function, I noticed that some data is missing. This happens when the data is huge. It does not happen for small quantities. Has anyone come across this issue before?&lt;/P&gt;</description>
      <pubDate>Thu, 06 Feb 2025 18:34:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109275#M43263</guid>
      <dc:creator>asurendran</dc:creator>
      <dc:date>2025-02-06T18:34:15Z</dc:date>
    </item>
    <item>
      <title>Re: Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109284#M43264</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/96713"&gt;@asurendran&lt;/a&gt;&amp;nbsp;Missing data with PySpark window functions on large datasets often stems from incorrect data partitioning (leading to incomplete window calculations) and/or data skew (causing executor overload or failures). Memory limitations and network issues can also contribute.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;Can you elaborate a little more?&lt;/P&gt;</description>
      <pubDate>Thu, 06 Feb 2025 19:30:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109284#M43264</guid>
      <dc:creator>MadhuB</dc:creator>
      <dc:date>2025-02-06T19:30:07Z</dc:date>
    </item>
    <item>
      <title>Re: Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109294#M43265</link>
      <description>&lt;P&gt;I have a dataframe with key, eff date, end date... I want to use a window function with the lag option to populate the previous end date... I am partitioning by the key and ordering by the effective date. But I am seeing a count difference.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 06 Feb 2025 20:21:22 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109294#M43265</guid>
      <dc:creator>asurendran</dc:creator>
      <dc:date>2025-02-06T20:21:22Z</dc:date>
    </item>
    <item>
      <title>Re: Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109307#M43268</link>
      <description>&lt;P&gt;Would caching the dataframe help to fix this issue?&lt;/P&gt;</description>
      <pubDate>Thu, 06 Feb 2025 21:21:11 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109307#M43268</guid>
      <dc:creator>asurendran</dc:creator>
      <dc:date>2025-02-06T21:21:11Z</dc:date>
    </item>
    <item>
      <title>Re: Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109308#M43269</link>
      <description>&lt;P&gt;Before applying the window function, try repartitioning your DataFrame based on the key (or the salted key). This can help distribute the data more evenly across the executors.&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;from pyspark.sql import Window
from pyspark.sql.functions import lag

# Repartition DataFrame
df = df.repartition("key")

# Define window specification
window_spec = Window.partitionBy("key").orderBy("eff_date")

# Add previous end date
df = df.withColumn("prev_end_date", lag("end_date", 1).over(window_spec))

# Show the result
df.show()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 06 Feb 2025 21:21:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109308#M43269</guid>
      <dc:creator>MadhuB</dc:creator>
      <dc:date>2025-02-06T21:21:58Z</dc:date>
    </item>
    <item>
      <title>Re: Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109309#M43270</link>
      <description>&lt;P&gt;Caching is a performance optimization, but it may not help if&amp;nbsp;the problem lies in the logic of your window function, data skew, or data inconsistencies.&amp;nbsp;&lt;/P&gt;&lt;P&gt;I would recommend trying a memory-optimized cluster to see how it goes.&lt;/P&gt;</description>
      <pubDate>Thu, 06 Feb 2025 21:28:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109309#M43270</guid>
      <dc:creator>MadhuB</dc:creator>
      <dc:date>2025-02-06T21:28:46Z</dc:date>
    </item>
    <item>
      <title>Re: Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109310#M43271</link>
      <description>&lt;P&gt;Thanks Madhu! Will try this.&lt;/P&gt;</description>
      <pubDate>Thu, 06 Feb 2025 21:42:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109310#M43271</guid>
      <dc:creator>asurendran</dc:creator>
      <dc:date>2025-02-06T21:42:04Z</dc:date>
    </item>
    <item>
      <title>Re: Some records are missing after window function</title>
      <link>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109456#M43325</link>
      <description>&lt;P&gt;I tried repartitioning and renaming dataframe name for each transformation. Still it's showing missing records. Please let me know if you have any other suggestion.&lt;/P&gt;</description>
      <pubDate>Fri, 07 Feb 2025 23:26:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/some-records-are-missing-after-window-function/m-p/109456#M43325</guid>
      <dc:creator>asurendran</dc:creator>
      <dc:date>2025-02-07T23:26:24Z</dc:date>
    </item>
  </channel>
</rss>

