<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Delta log statistics - timestamp type not working in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12730#M7495</link>
    <description>&lt;P&gt;same here&lt;/P&gt;</description>
    <pubDate>Tue, 26 Jul 2022 10:04:53 GMT</pubDate>
    <dc:creator>-werners-</dc:creator>
    <dc:date>2022-07-26T10:04:53Z</dc:date>
    <item>
      <title>Delta log statistics - timestamp type not working</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12725#M7490</link>
      <description>&lt;P&gt;Hello team!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;As per the documentation, I understand that table statistics (e.g. min, max, count) can be fetched from the delta log, so that the underlying data of a delta table does not have to be read.&lt;/P&gt;&lt;P&gt;This is the case for numerical types, and timestamp is supposed to be supported as well.&lt;/P&gt;&lt;P&gt;In my test example, though, when it comes to the timestamp column, the query plan shows that the data is read instead of the delta_log.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Any thoughts on how to make Spark read the max value of the timestamp column from the delta log?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Delta table creation:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;Seq(
  (1, java.sql.Timestamp.valueOf(java.time.LocalDateTime.now())), 
  (2, java.sql.Timestamp.valueOf(java.time.LocalDateTime.now().minusDays(1))),
  (3, java.sql.Timestamp.valueOf(java.time.LocalDateTime.now().minusDays(2)))
  )
.toDF("value", "ts")
.write
.saveAsTable("test.test_ts")&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;As expected, the max for column &lt;B&gt;value&lt;/B&gt; is read from the log:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="max value"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/1702iE7385490B351D2C6/image-size/large?v=v2&amp;amp;px=999" role="button" title="max value" alt="max value" /&gt;&lt;/span&gt;However, this is not the case for the &lt;B&gt;ts&lt;/B&gt; column:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="image.png"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/1696iE29FD41A1984A7C5/image-size/large?v=v2&amp;amp;px=999" role="button" title="image.png" alt="image.png" /&gt;&lt;/span&gt;even though the delta log contains the needed information (max of the &lt;B&gt;ts&lt;/B&gt; column):&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"9f6cbb6f-e866-4279-89a3-609bacf175cb","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ts\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1658749213255}}
{"add":{"path":"part-00000-0848b536-6c7b-4429-b168-44d53ec48ba4-c000.snappy.parquet","partitionValues":{},"size":852,"modificationTime":1658749217000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"value\":1,\"ts\":\"2022-07-25T11:40:12.884Z\"},\"maxValues\":{\"value\":1,\"ts\":\"2022-07-25T11:40:12.884Z\"},\"nullCount\":{\"value\":0,\"ts\":0}}","tags":{"INSERTION_TIME":"1658749217000000","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00001-0c883bda-bfc3-4921-a46f-32eaa9fb7dbc-c000.snappy.parquet","partitionValues":{},"size":852,"modificationTime":1658749217000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"value\":2,\"ts\":\"2022-07-24T11:40:12.885Z\"},\"maxValues\":{\"value\":2,\"ts\":\"2022-07-24T11:40:12.885Z\"},\"nullCount\":{\"value\":0,\"ts\":0}}","tags":{"INSERTION_TIME":"1658749217000001","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"add":{"path":"part-00002-54975e9b-3df8-4382-afa0-2929778ecb0a-c000.snappy.parquet","partitionValues":{},"size":853,"modificationTime":1658749217000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"value\":3,\"ts\":\"2022-07-23T11:40:12.885Z\"},\"maxValues\":{\"value\":3,\"ts\":\"2022-07-23T11:40:12.885Z\"},\"nullCount\":{\"value\":0,\"ts\":0}}","tags":{"INSERTION_TIME":"1658749217000002","OPTIMIZE_TARGET_SIZE":"268435456"}}}
{"commitInfo":{"timestamp":1658749218051,"userId":"6999390541537531","userName":"p.maroudis@kaizengaming.com","operation":"CREATE TABLE AS SELECT","operationParameters":{"isManaged":"true","description":null,"partitionBy":"[]","properties":"{}"},"notebook":{"notebookId":"2851472110292823"},"clusterId":"0920-080342-who267","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"2557"},"engineInfo":"Databricks-Runtime/10.4.x-scala2.12","txnId":"775ac353-9df2-45a2-99d2-1a15783c0e40"}}&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 25 Jul 2022 12:20:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12725#M7490</guid>
      <dc:creator>pantelis_mare</dc:creator>
      <dc:date>2022-07-25T12:20:26Z</dc:date>
    </item>
    <item>
      <title>Re: Delta log statistics - timestamp type not working</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12726#M7491</link>
      <description>&lt;P&gt;Are you sure the timestamp column is a valid Spark timestamp type?&lt;/P&gt;</description>
      <pubDate>Tue, 26 Jul 2022 07:32:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12726#M7491</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2022-07-26T07:32:26Z</dc:date>
    </item>
    <item>
      <title>Re: Delta log statistics - timestamp type not working</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12727#M7492</link>
      <description>&lt;P&gt;Yes, as the schema shows. I also added an explicit cast to TimestampType (even though I create it as a java.sql.Timestamp anyway).&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="image"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/1683i26C0D512672B8284/image-size/large?v=v2&amp;amp;px=999" role="button" title="image" alt="image" /&gt;&lt;/span&gt;Do you also get the same behavior, at least with my test case?&lt;/P&gt;</description>
      <pubDate>Tue, 26 Jul 2022 09:39:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12727#M7492</guid>
      <dc:creator>pantelis_mare</dc:creator>
      <dc:date>2022-07-26T09:39:51Z</dc:date>
    </item>
    <item>
      <title>Re: Delta log statistics - timestamp type not working</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12728#M7493</link>
      <description>&lt;P&gt;Strange.&lt;/P&gt;&lt;P&gt;Have you checked the actual plan (spark ui)?&lt;/P&gt;</description>
      <pubDate>Tue, 26 Jul 2022 09:47:22 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12728#M7493</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2022-07-26T09:47:22Z</dc:date>
    </item>
    <item>
      <title>Re: Delta log statistics - timestamp type not working</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12729#M7494</link>
      <description>&lt;P&gt;Yep, I attached it in my question. Plus, it's obvious from the number of tasks it spawns: it reads the whole table (this is more obvious with big tables that have more read partitions).&lt;/P&gt;&lt;P&gt;Can you reproduce it on your side as well, or is it just us?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 26 Jul 2022 09:54:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12729#M7494</guid>
      <dc:creator>pantelis_mare</dc:creator>
      <dc:date>2022-07-26T09:54:46Z</dc:date>
    </item>
    <item>
      <title>Re: Delta log statistics - timestamp type not working</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12730#M7495</link>
      <description>&lt;P&gt;same here&lt;/P&gt;</description>
      <pubDate>Tue, 26 Jul 2022 10:04:53 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12730#M7495</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2022-07-26T10:04:53Z</dc:date>
    </item>
    <item>
      <title>Re: Delta log statistics - timestamp type not working</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12731#M7496</link>
      <description>&lt;P&gt;After communicating with our CSE at Databricks, it turns out there was an issue with truncation and precision loss that made them deactivate this feature.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Actually, I also had this issue on another project when trying to parse timestamp fields using json4s. It uses the SimpleDateFormat class to parse timestamps, and that class does not support microsecond precision, so maybe this is related.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;@Werner Stinckens&amp;nbsp;thanks again for your help!&lt;/P&gt;</description>
      <pubDate>Tue, 26 Jul 2022 10:09:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-log-statistics-timestamp-type-not-working/m-p/12731#M7496</guid>
      <dc:creator>pantelis_mare</dc:creator>
      <dc:date>2022-07-26T10:09:49Z</dc:date>
    </item>
  </channel>
</rss>

