<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Data getting missed while reading from azure event hub using spark streaming in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39031#M26854</link>
    <description>&lt;P&gt;Ideally you can still read the event hub events, and see if the missing ones are there.&amp;nbsp; If so: it must be spark related.&lt;BR /&gt;If they are already gone, hard to tell.&amp;nbsp; I'd store the events in a raw table so you can do validity checks.&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 03 Aug 2023 12:05:33 GMT</pubDate>
    <dc:creator>-werners-</dc:creator>
    <dc:date>2023-08-03T12:05:33Z</dc:date>
    <item>
      <title>Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39009#M26844</link>
      <description>&lt;P&gt;Hi All,&lt;/P&gt;&lt;P&gt;I am facing an issue of data getting missed.&lt;/P&gt;&lt;P&gt;I am reading the data from azure event hub and after flattening the json data I am storing it in a parquet file and then using another databricks notebook to perform the merge operations on my delta table by adding some etl columns to it.&lt;/P&gt;&lt;P&gt;However in between somewhere the records are getting missed.&lt;/P&gt;&lt;P&gt;I have scheduled the job to run every hour.&lt;/P&gt;&lt;P&gt;Can someone please help me out with this.&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 11:15:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39009#M26844</guid>
      <dc:creator>Rishi045</dc:creator>
      <dc:date>2023-08-03T11:15:25Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39012#M26845</link>
      <description>&lt;P&gt;do you land the event hub data unprocessed on a data lake?&amp;nbsp; if so, you can check if everything is there.&lt;BR /&gt;If so: check the next step and so on.&lt;BR /&gt;If you do not save the raw data, with some luck you have it still in event hub.&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 11:27:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39012#M26845</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-08-03T11:27:52Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39014#M26847</link>
      <description>&lt;P&gt;No the data is processed before landing on a datalake&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 11:36:53 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39014#M26847</guid>
      <dc:creator>Rishi045</dc:creator>
      <dc:date>2023-08-03T11:36:53Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39015#M26848</link>
      <description>&lt;P&gt;I could not find those missing records in data lake as well&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 11:38:11 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39015#M26848</guid>
      <dc:creator>Rishi045</dc:creator>
      <dc:date>2023-08-03T11:38:11Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39017#M26850</link>
      <description>&lt;P&gt;Using below code :&lt;/P&gt;&lt;DIV&gt;conf = {}&lt;/DIV&gt;&lt;DIV&gt;df = spark.readStream.format("eventhubs").options(**conf).load()&lt;/DIV&gt;&lt;DIV&gt;dataDF = df.select(col("body").cast("STRING"))&lt;/DIV&gt;&lt;DIV&gt;data = dataDF.select(json_tuple(col("body"),"table","op_type","records","op_ts")) \&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; .toDF("table","op_type","records","op_ts")&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;final_data = data.withColumn("records_json",from_json(col("records"),reqSchema))&lt;/DIV&gt;&lt;DIV&gt;final_data = final_data.select(&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; *[col("records_json." + field).alias(field) for field in reqSchema.fieldNames()],&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; col("op_type"),&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; col("op_ts"))&lt;/DIV&gt;&lt;DIV&gt;final_data.orderBy(col("op_ts").desc())&lt;/DIV&gt;&lt;DIV&gt;final_data = final_data.dropDuplicates([primaryKey])&amp;nbsp;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;final_data = final_data.distinct()&lt;/DIV&gt;&lt;DIV&gt;final_data = final_data.drop(final_data.op_ts)&lt;/DIV&gt;&lt;DIV&gt;final_data = final_data.drop(final_data.op_type)&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;final_data.coalesce(1).writeStream \&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; .format("parquet") \&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; .outputMode("append") \&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; .option("checkpointLocation",checkPoint_url) \&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; .trigger(once=True)\&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; .start(rawFilePath_url) \&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp; &amp;nbsp; .awaitTermination()&lt;/DIV&gt;</description>
      <pubDate>Thu, 03 Aug 2023 11:43:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39017#M26850</guid>
      <dc:creator>Rishi045</dc:creator>
      <dc:date>2023-08-03T11:43:21Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39020#M26851</link>
      <description>&lt;P&gt;without having the actual raw data, it is hard to figure out where the issue resides.&amp;nbsp; It could be the code or it could be event hub.&lt;BR /&gt;I'd store the raw event hub data in a data lake and use autoloader for further processing.&lt;/P&gt;&lt;P&gt;One thing in your code that caught my attention is the use of dropduplicates and distinct.&lt;BR /&gt;Are you sure you are not dropping too much?&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 11:48:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39020#M26851</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-08-03T11:48:05Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39024#M26852</link>
      <description>&lt;P&gt;I am dropping duplicates based on my primary key, as multiple records are coming from event hub for a single primary key and I want to take the latest one. Also, I am doing an order by on the op_ts column so that I get the latest record only.&lt;/P&gt;&lt;P&gt;I am not sure whether distinct can create any issue in the code.&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 11:54:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39024#M26852</guid>
      <dc:creator>Rishi045</dc:creator>
      <dc:date>2023-08-03T11:54:02Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39030#M26853</link>
      <description>&lt;P&gt;- In the EventHub, you can preview the event hub job using Azure Analytics, so please first check whether all records are there&lt;/P&gt;&lt;P&gt;- Please set in Databricks that it is saved directly to the bronze delta table without performing any aggregation, just 1 to 1, and check if all records are there.&lt;/P&gt;&lt;P&gt;- Please consider using Delta Live Table for ingestion from Event Hub. It will make your life easier regarding monitoring stream, data quality, and performing full refresh when needed.&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 12:00:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39030#M26853</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2023-08-03T12:00:07Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39031#M26854</link>
      <description>&lt;P&gt;Ideally you can still read the event hub events, and see if the missing ones are there.&amp;nbsp; If so: it must be spark related.&lt;BR /&gt;If they are already gone, hard to tell.&amp;nbsp; I'd store the events in a raw table so you can do validity checks.&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 12:05:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39031#M26854</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-08-03T12:05:33Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39032#M26855</link>
      <description>&lt;P&gt;In event hub I can see the missing records. Should I be using forEachBatch somewhere in my code? Or any other suggestion?&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 12:09:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39032#M26855</guid>
      <dc:creator>Rishi045</dc:creator>
      <dc:date>2023-08-03T12:09:20Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39034#M26856</link>
      <description>&lt;P&gt;Basically you want to do some cleanup of duplicates. And you want to do this for each microbatch coming in I assume.&lt;BR /&gt;So indeed, create a function which does the dedup etc and call it in foreachbatch.&lt;BR /&gt;&lt;A href="https://docs.databricks.com/en/delta/merge.html#data-deduplication-when-writing-into-delta-tables" target="_blank"&gt;https://docs.databricks.com/en/delta/merge.html#data-deduplication-when-writing-into-delta-tables&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 12:22:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39034#M26856</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-08-03T12:22:08Z</dc:date>
    </item>
    <item>
      <title>Re: Data getting missed while reading from azure event hub using spark streaming</title>
      <link>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39036#M26857</link>
      <description>&lt;P&gt;As of now I am not having any foreachbatch in my code.&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am performing dedup on entire data coming from event hub&lt;/P&gt;</description>
      <pubDate>Thu, 03 Aug 2023 12:35:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-getting-missed-while-reading-from-azure-event-hub-using/m-p/39036#M26857</guid>
      <dc:creator>Rishi045</dc:creator>
      <dc:date>2023-08-03T12:35:01Z</dc:date>
    </item>
  </channel>
</rss>

