<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Running large window spark structured streaming aggregations with small slide duration in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/running-large-window-spark-structured-streaming-aggregations/m-p/17439#M11460</link>
    <description>&lt;P&gt;Hi @Sergey Volkov,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Just a friendly follow-up. Are you still looking for help, or did any of the docs that Kaniz has shared help you?&lt;/P&gt;</description>
    <pubDate>Fri, 29 Jul 2022 20:13:41 GMT</pubDate>
    <dc:creator>jose_gonzalez</dc:creator>
    <dc:date>2022-07-29T20:13:41Z</dc:date>
    <item>
      <title>Running large window spark structured streaming aggregations with small slide duration</title>
      <link>https://community.databricks.com/t5/data-engineering/running-large-window-spark-structured-streaming-aggregations/m-p/17437#M11458</link>
      <description>&lt;P&gt;I want to run aggregations on large windows (90 days) with a small slide duration (5 minutes).&lt;/P&gt;&lt;P&gt;The straightforward solution leads to a giant state of hundreds of gigabytes, which doesn't look acceptable.&lt;/P&gt;&lt;P&gt;Are there any best practices for doing this?&lt;/P&gt;&lt;P&gt;I am currently considering the following scenarios:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;Use flatMapGroupsWithState and implement EWMA (exponentially weighted moving average) instead of the average to reduce state. Is there a good library for EWMA?&lt;/LI&gt;&lt;LI&gt;Somehow join data from two streams, e.g. a 90-day window with a 1-day slide and a 1-day window with a 5-minute slide&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;Any other ideas?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://docs.microsoft.com/en-us/answers/questions/884473/running-large-window-aggregations-in-databricks-sp.html" alt="https://docs.microsoft.com/en-us/answers/questions/884473/running-large-window-aggregations-in-databricks-sp.html" target="_blank"&gt;Thread in Azure Q&amp;amp;A&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 17 Jun 2022 09:25:53 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/running-large-window-spark-structured-streaming-aggregations/m-p/17437#M11458</guid>
      <dc:creator>serg-v</dc:creator>
      <dc:date>2022-06-17T09:25:53Z</dc:date>
    </item>
    <item>
      <title>Re: Running large window spark structured streaming aggregations with small slide duration</title>
      <link>https://community.databricks.com/t5/data-engineering/running-large-window-spark-structured-streaming-aggregations/m-p/17439#M11460</link>
      <description>&lt;P&gt;Hi @Sergey Volkov,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Just a friendly follow-up. Are you still looking for help, or did any of the docs that Kaniz has shared help you?&lt;/P&gt;</description>
      <pubDate>Fri, 29 Jul 2022 20:13:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/running-large-window-spark-structured-streaming-aggregations/m-p/17439#M11460</guid>
      <dc:creator>jose_gonzalez</dc:creator>
      <dc:date>2022-07-29T20:13:41Z</dc:date>
    </item>
    <item>
      <title>Re: Running large window spark structured streaming aggregations with small slide duration</title>
      <link>https://community.databricks.com/t5/data-engineering/running-large-window-spark-structured-streaming-aggregations/m-p/17440#M11461</link>
      <description>&lt;P&gt;Hi.&lt;/P&gt;&lt;P&gt;&amp;gt; Are you still looking for help&lt;/P&gt;&lt;P&gt;No, thank you, we have implemented EWMA using flatMapGroupsWithState.&lt;/P&gt;&lt;P&gt;&amp;gt; did any of the docs that Kaniz has shared help you?&lt;/P&gt;&lt;P&gt;Not really. They are only loosely connected to my problem.&lt;/P&gt;</description>
      <pubDate>Sun, 21 Aug 2022 09:51:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/running-large-window-spark-structured-streaming-aggregations/m-p/17440#M11461</guid>
      <dc:creator>serg-v</dc:creator>
      <dc:date>2022-08-21T09:51:07Z</dc:date>
    </item>
  </channel>
</rss>

