<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Get metadata of files present in a zip in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83377#M36919</link>
    <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Here is the code which I am using&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;def register_udf():
    def extract_file_metadata_from_zip(binary_content):
        metadata_list = []
        with io.BytesIO(binary_content) as bio:
            with zipfile.ZipFile(bio, "r") as zip_ref:
                for file_info in zip_ref.infolist():
                    file_name = file_info.filename
                    modification_time = datetime.datetime(*file_info.date_time)
                    metadata_list.append((file_name, modification_time))
        return metadata_list
    meta_schema = ArrayType(
        StructType(
            [
                StructField("file_name", StringType(), True),
                StructField("modification_time", TimestampType(), True),
            ]
        )
    )
    extract_metadata_udf = udf(extract_file_metadata_from_zip, meta_schema)
    return extract_metadata_udf
    
    
def get_last_modification_times(zip_file_path, expected_date, extract_metadata_udf):
    try:
        zip_file_df = (
            spark.read.format("binaryFile")
            .option("pathGlobFilter", "*.zip")
            .load(zip_file_path)
        )
        extracted_metadata_df = zip_file_df.withColumn(
            "file_metadata", extract_metadata_udf(col("content"))
        )
        exploded_metadata_df = extracted_metadata_df.select(
            explode("file_metadata").alias("metadata")
        )
        return exploded_metadata_df 
    except Exception as e:
        print("An error occurred: ", str(e))&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Mon, 19 Aug 2024 02:23:23 GMT</pubDate>
    <dc:creator>seeker</dc:creator>
    <dc:date>2024-08-19T02:23:23Z</dc:date>
    <item>
      <title>Get metadata of files present in a zip</title>
      <link>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83376#M36918</link>
      <description>&lt;P&gt;I have a .zip file present on an ADLS path which contains multiple files of different formats. I want to get metadata of the files like file name, modification time present in it without unzipping it. I have a code which works for smaller zip but runs into memory issues for large zip files leading to job failures. Is there a way to handle this within pyspark itself?&lt;/P&gt;</description>
      <pubDate>Mon, 19 Aug 2024 02:05:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83376#M36918</guid>
      <dc:creator>seeker</dc:creator>
      <dc:date>2024-08-19T02:05:51Z</dc:date>
    </item>
    <item>
      <title>Re: Get metadata of files present in a zip</title>
      <link>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83377#M36919</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Here is the code which I am using&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;def register_udf():
    def extract_file_metadata_from_zip(binary_content):
        metadata_list = []
        with io.BytesIO(binary_content) as bio:
            with zipfile.ZipFile(bio, "r") as zip_ref:
                for file_info in zip_ref.infolist():
                    file_name = file_info.filename
                    modification_time = datetime.datetime(*file_info.date_time)
                    metadata_list.append((file_name, modification_time))
        return metadata_list
    meta_schema = ArrayType(
        StructType(
            [
                StructField("file_name", StringType(), True),
                StructField("modification_time", TimestampType(), True),
            ]
        )
    )
    extract_metadata_udf = udf(extract_file_metadata_from_zip, meta_schema)
    return extract_metadata_udf
    
    
def get_last_modification_times(zip_file_path, expected_date, extract_metadata_udf):
    try:
        zip_file_df = (
            spark.read.format("binaryFile")
            .option("pathGlobFilter", "*.zip")
            .load(zip_file_path)
        )
        extracted_metadata_df = zip_file_df.withColumn(
            "file_metadata", extract_metadata_udf(col("content"))
        )
        exploded_metadata_df = extracted_metadata_df.select(
            explode("file_metadata").alias("metadata")
        )
        return exploded_metadata_df 
    except Exception as e:
        print("An error occurred: ", str(e))&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 19 Aug 2024 02:23:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83377#M36919</guid>
      <dc:creator>seeker</dc:creator>
      <dc:date>2024-08-19T02:23:23Z</dc:date>
    </item>
    <item>
      <title>Re: Get metadata of files present in a zip</title>
      <link>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83451#M36936</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/116518"&gt;@seeker&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;I'm afraid there is no easy way to do that in pyspark.&amp;nbsp;Spark supports the following compression formats:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;bzip2&lt;/LI&gt;&lt;LI&gt;deflate&lt;/LI&gt;&lt;LI&gt;snappy&lt;/LI&gt;&lt;LI&gt;lz4&lt;/LI&gt;&lt;LI&gt;gzip&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;Thus, there is no native support for the&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;zip&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;format. And your solution will be slow, because you are using UDF which means it will apply this function on every row &lt;span class="lia-unicode-emoji" title=":confused_face:"&gt;😕&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 19 Aug 2024 13:34:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83451#M36936</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2024-08-19T13:34:33Z</dc:date>
    </item>
    <item>
      <title>Re: Get metadata of files present in a zip</title>
      <link>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83589#M36969</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/116518"&gt;@seeker&lt;/a&gt;, Thanks for reaching out!&lt;/P&gt;
&lt;P&gt;Please review the responses and let us know which best addresses your question. Your feedback is valuable to us and the community.&lt;/P&gt;
&lt;P&gt;If the response resolves your issue, kindly mark it as the accepted solution. This will help close the thread and assist others with similar queries.&lt;/P&gt;
&lt;P&gt;We appreciate your participation and are here if you need further assistance!&lt;/P&gt;</description>
      <pubDate>Tue, 20 Aug 2024 11:43:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83589#M36969</guid>
      <dc:creator>Retired_mod</dc:creator>
      <dc:date>2024-08-20T11:43:20Z</dc:date>
    </item>
    <item>
      <title>Re: Get metadata of files present in a zip</title>
      <link>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83671#M36980</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/116518"&gt;@seeker&lt;/a&gt;,&amp;nbsp;There are only 2 ways I can think of to do it:&lt;/P&gt;
&lt;UL class="p-rich_text_list p-rich_text_list__bullet" data-stringify-type="unordered-list" data-indent="0" data-border="0"&gt;
&lt;LI data-stringify-indent="0" data-stringify-border="0"&gt;Write a UDF.&lt;/LI&gt;
&lt;LI data-stringify-indent="0" data-stringify-border="0"&gt;Write customized MapReduce logic instead of using Spark SQL.&lt;/LI&gt;
&lt;/UL&gt;
&lt;DIV class="p-rich_text_section"&gt;But they are kind of the same. So I would say UDF is a good solution.&lt;/DIV&gt;</description>
      <pubDate>Tue, 20 Aug 2024 18:55:36 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/get-metadata-of-files-present-in-a-zip/m-p/83671#M36980</guid>
      <dc:creator>Retired_mod</dc:creator>
      <dc:date>2024-08-20T18:55:36Z</dc:date>
    </item>
  </channel>
</rss>

