<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Extract datetime value from the file name in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103144#M41348</link>
    <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/110498"&gt;@David_Billa&lt;/a&gt;&amp;nbsp;how about&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import pyspark.sql.functions as F

df_with_datetime = df.withColumn(
    'extracted_datetime',
    F.to_timestamp(
        F.concat(
            *[F.split_part(F.col('file_name'), F.lit("_"), F.lit(i)) for i in range(-6, -1)]
        ),
        'yyyyMMdd\'T\'HHmmss'
    )
)

display(df_with_datetime)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;or&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="sql"&gt;SELECT *,
       to_timestamp(
           concat_ws('', 
               split_part(file_name, '_', -6),
               split_part(file_name, '_', -5),
               split_part(file_name, '_', -4),
               split_part(file_name, '_', -3),
               split_part(file_name, '_', -2)
           ),
           'yyyyMMdd\'T\'HHmmss'
       ) AS extracted_datetime
FROM df&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Tue, 24 Dec 2024 14:39:27 GMT</pubDate>
    <dc:creator>ck1</dc:creator>
    <dc:date>2024-12-24T14:39:27Z</dc:date>
    <item>
      <title>Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103117#M41334</link>
      <description>&lt;P&gt;I have the filename below and I want to extract the datetime value and convert it to a datetime data type.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;This_is_new_file_2024_12_06T11_00_49_AM.csv&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;Here I want to extract only '&lt;STRONG&gt;2024_12_06T11_00_49&lt;/STRONG&gt;' and convert it to a datetime value in a new field. I tried substr with the format&amp;nbsp;yyyyMMddHHmmss and it's not working. It produces only null values instead of the datetime value.&lt;/P&gt;&lt;P&gt;Any help?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 10:47:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103117#M41334</guid>
      <dc:creator>David_Billa</dc:creator>
      <dc:date>2024-12-24T10:47:25Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103132#M41340</link>
      <description>&lt;P&gt;Hello David, you can use something like:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, to_timestamp

# Initialize Spark session
spark = SparkSession.builder.appName("ExtractDatetime").getOrCreate()

# Sample data
data = [("This_is_new_file_2024_12_06T11_00_49_AM.csv",)]
df = spark.createDataFrame(data, ["filename"])

# Extract the datetime string using regexp_extract
datetime_pattern = r"(\d{4}_\d{2}_\d{2}T\d{2}_\d{2}_\d{2})"
df = df.withColumn("datetime_str", regexp_extract("filename", datetime_pattern, 1))

# Convert the extracted string to a datetime data type
df = df.withColumn("datetime", to_timestamp("datetime_str", "yyyy_MM_dd'T'HH_mm_ss"))

# Show the result
df.show(truncate=False)&lt;/LI-CODE&gt;</description>
      <pubDate>Tue, 24 Dec 2024 12:16:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103132#M41340</guid>
      <dc:creator>Walter_C</dc:creator>
      <dc:date>2024-12-24T12:16:43Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103135#M41342</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88823"&gt;@Walter_C&lt;/a&gt;&amp;nbsp;thanks for your help. Can't we use the split_part or substr function to extract the datetime value and then apply the datetime format?&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 13:04:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103135#M41342</guid>
      <dc:creator>David_Billa</dc:creator>
      <dc:date>2024-12-24T13:04:05Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103138#M41344</link>
      <description>&lt;P&gt;I got the same result as you when using split: I am getting a null result. Another option I can suggest, to get only the datetime, is:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;
&lt;LI-CODE lang="python"&gt;import re
from datetime import datetime

filename = "This_is_new_file_2024_12_06T11_00_49_AM.csv"

match = re.search(r"\d{4}_\d{2}_\d{2}T\d{2}_\d{2}_\d{2}", filename)
datetime_string = match.group(0) if match else None

if datetime_string:
    datetime_object = datetime.strptime(datetime_string, "%Y_%m_%dT%H_%M_%S")
    print(datetime_object)
else:
    print("Datetime not found in the filename")
&lt;/LI-CODE&gt;</description>
      <pubDate>Tue, 24 Dec 2024 13:49:00 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103138#M41344</guid>
      <dc:creator>Walter_C</dc:creator>
      <dc:date>2024-12-24T13:49:00Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103141#M41347</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88823"&gt;@Walter_C&lt;/a&gt;&amp;nbsp;So is it not possible to achieve this result with SQL functions and a datetime format? The reason is that I want to incorporate this solution into the existing SELECT statement.&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 14:16:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103141#M41347</guid>
      <dc:creator>David_Billa</dc:creator>
      <dc:date>2024-12-24T14:16:25Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103144#M41348</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/110498"&gt;@David_Billa&lt;/a&gt;&amp;nbsp;how about&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import pyspark.sql.functions as F

df_with_datetime = df.withColumn(
    'extracted_datetime',
    F.to_timestamp(
        F.concat(
            *[F.split_part(F.col('file_name'), F.lit("_"), F.lit(i)) for i in range(-6, -1)]
        ),
        'yyyyMMdd\'T\'HHmmss'
    )
)

display(df_with_datetime)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;or&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="sql"&gt;SELECT *,
       to_timestamp(
           concat_ws('', 
               split_part(file_name, '_', -6),
               split_part(file_name, '_', -5),
               split_part(file_name, '_', -4),
               split_part(file_name, '_', -3),
               split_part(file_name, '_', -2)
           ),
           'yyyyMMdd\'T\'HHmmss'
       ) AS extracted_datetime
FROM df&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 14:39:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103144#M41348</guid>
      <dc:creator>ck1</dc:creator>
      <dc:date>2024-12-24T14:39:27Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103145#M41349</link>
      <description>&lt;P&gt;Unfortunately, I am not able to make it work with SQL functions.&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 14:42:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103145#M41349</guid>
      <dc:creator>Walter_C</dc:creator>
      <dc:date>2024-12-24T14:42:38Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103157#M41352</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/106756"&gt;@ck1&lt;/a&gt;&amp;nbsp;looks good now. I see that the datetime value is displayed like 2024-12-06T11:00:49.000+00:00.&lt;/P&gt;&lt;P&gt;Can't we drop the&amp;nbsp;.000+00:00 part?&lt;/P&gt;&lt;P&gt;Any help here,&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/88823"&gt;@Walter_C&lt;/a&gt;?&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 18:06:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103157#M41352</guid>
      <dc:creator>David_Billa</dc:creator>
      <dc:date>2024-12-24T18:06:02Z</dc:date>
    </item>
    <item>
      <title>Re: Extract datetime value from the file name</title>
      <link>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103306#M41400</link>
      <description>&lt;P&gt;I think it's not really possible, though I am quite new to Databricks. Here are the types that one can use:&amp;nbsp;&lt;A href="https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html" target="_blank"&gt;Data types | Databricks on AWS&lt;/A&gt;&lt;BR /&gt;There is a date type and a timestamp type, but it doesn't look like there is anything in between. (You could, of course, save a string representation of the datetime...)&lt;/P&gt;</description>
      <pubDate>Fri, 27 Dec 2024 14:20:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/extract-datetime-value-from-the-file-name/m-p/103306#M41400</guid>
      <dc:creator>ck1</dc:creator>
      <dc:date>2024-12-27T14:20:04Z</dc:date>
    </item>
  </channel>
</rss>

