<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic best practice to parse binary file like pdf, png, docx in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/best-practice-to-parse-binary-file-like-pdf-png-docx/m-p/147921#M52796</link>
    <description>&lt;P&gt;Hello Guyz,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;I have used Auto Loader to load PDF files as a binary source. I don't want to use ai_parse on Databricks, so I am using Docling instead. Does anyone know how to use it well?&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;My df looks like this:&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import functions as F
df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "binaryFile")
    .option("pathGlobFilter", "*.pdf")
    .load("/Volumes/default/landing/unstructured_data/client")
    .selectExpr("*", "_metadata as _source_file_metadata")
    .withColumns(
        {
            "ingestion_timestamp": F.current_timestamp(),
            "file_extension": F.lower(F.element_at(F.split("path", "\\."), -1)),
            "file_size_mb": col("length") / (1024 * 1024),
            "document_id": F.sha1(
                F.concat(
                    F.col("_source_file_metadata.file_name"), col("modificationTime")
                )
            ),
            "supported_format": F.when(
                F.col("file_extension").isin(
                    ["pdf", "PDF", "png", "PNG", "jpg", "JPG", "jpeg", "JPEG"]
                ),
                F.lit(True),
            ).otherwise(F.lit(False)),
        }
    )
)&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;Cordially,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
    <pubDate>Tue, 10 Feb 2026 15:31:12 GMT</pubDate>
    <dc:creator>seefoods</dc:creator>
    <dc:date>2026-02-10T15:31:12Z</dc:date>
    <item>
      <title>best practice to parse binary file like pdf, png, docx</title>
      <link>https://community.databricks.com/t5/data-engineering/best-practice-to-parse-binary-file-like-pdf-png-docx/m-p/147921#M52796</link>
      <description>&lt;P&gt;Hello Guyz,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;I have used Auto Loader to load PDF files as a binary source. I don't want to use ai_parse on Databricks, so I am using Docling instead. Does anyone know how to use it well?&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;My df looks like this:&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import functions as F
df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "binaryFile")
    .option("pathGlobFilter", "*.pdf")
    .load("/Volumes/default/landing/unstructured_data/client")
    .selectExpr("*", "_metadata as _source_file_metadata")
    .withColumns(
        {
            "ingestion_timestamp": F.current_timestamp(),
            "file_extension": F.lower(F.element_at(F.split("path", "\\."), -1)),
            "file_size_mb": col("length") / (1024 * 1024),
            "document_id": F.sha1(
                F.concat(
                    F.col("_source_file_metadata.file_name"), col("modificationTime")
                )
            ),
            "supported_format": F.when(
                F.col("file_extension").isin(
                    ["pdf", "PDF", "png", "PNG", "jpg", "JPG", "jpeg", "JPEG"]
                ),
                F.lit(True),
            ).otherwise(F.lit(False)),
        }
    )
)&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;Cordially,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 10 Feb 2026 15:31:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/best-practice-to-parse-binary-file-like-pdf-png-docx/m-p/147921#M52796</guid>
      <dc:creator>seefoods</dc:creator>
      <dc:date>2026-02-10T15:31:12Z</dc:date>
    </item>
    <item>
      <title>Re: best practice to parse binary file like pdf, png, docx</title>
      <link>https://community.databricks.com/t5/data-engineering/best-practice-to-parse-binary-file-like-pdf-png-docx/m-p/147966#M52802</link>
      <description>&lt;P&gt;Is the question that you want to use docling inside the stream to do the conversion?&lt;/P&gt;
&lt;P&gt;If so, you can&amp;nbsp;&lt;U&gt;pip install docling&lt;/U&gt; in your notebook and then use it inside foreachBatch, etc.&lt;/P&gt;
&lt;P&gt;This code here is just from Google to show you the general structure:&lt;/P&gt;
&lt;PRE data-processed="true"&gt;&lt;CODE data-processed="true"&gt;&lt;SPAN class="mexSqb" data-processed="true"&gt;def&lt;/SPAN&gt; &lt;SPAN class="Ff1zF" data-processed="true"&gt;process_row&lt;/SPAN&gt;&lt;SPAN class="undefined" data-processed="true"&gt;(&lt;/SPAN&gt;&lt;SPAN class="Ff1zF" data-processed="true"&gt;row&lt;/SPAN&gt;&lt;SPAN class="undefined" data-processed="true"&gt;):
    &lt;/SPAN&gt;&lt;SPAN class="ClTQqc" data-processed="true"&gt;# Custom logic for each row&lt;/SPAN&gt;&lt;SPAN class="undefined" data-processed="true"&gt;
    print(row)

&lt;/SPAN&gt;&lt;SPAN class="mexSqb" data-processed="true"&gt;def&lt;/SPAN&gt; &lt;SPAN class="Ff1zF" data-processed="true"&gt;foreach_batch_function&lt;/SPAN&gt;&lt;SPAN class="undefined" data-processed="true"&gt;(&lt;/SPAN&gt;&lt;SPAN class="Ff1zF" data-processed="true"&gt;batch_df&lt;/SPAN&gt;&lt;SPAN class="undefined" data-processed="true"&gt;, &lt;/SPAN&gt;&lt;SPAN class="Ff1zF" data-processed="true"&gt;epoch_id&lt;/SPAN&gt;&lt;SPAN class="undefined" data-processed="true"&gt;):
    &lt;/SPAN&gt;&lt;SPAN class="ClTQqc" data-processed="true"&gt;# Efficiently iterate over local rows on the driver&lt;/SPAN&gt;
    &lt;SPAN class="mexSqb" data-processed="true"&gt;for&lt;/SPAN&gt; &lt;SPAN class="Ff1zF" data-processed="true"&gt;row&lt;/SPAN&gt; &lt;SPAN class="mexSqb" data-processed="true"&gt;in&lt;/SPAN&gt;&lt;SPAN class="undefined" data-processed="true"&gt; batch_df.toLocalIterator():
        process_row(row)

streaming_df.writeStream.foreachBatch(foreach_batch_function).start()&lt;/SPAN&gt;&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&amp;nbsp;Here's pseudo-code (untested) for how you might want to do it:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;def parse_batch(batch_df, batch_id: int):
    import io, json, tempfile
    from docling.document_converter import DocumentConverter

    # Reuse one converter per micro-batch
    converter = DocumentConverter()  # defaults handle PDF parsing well

    rows = []
    # toLocalIterator streams rows without materializing entire batch on driver
    for row in batch_df.toLocalIterator():
        # Write binary content to a temp file; convert accepts a file path
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmp:
            tmp.write(row.content)
            tmp.flush()
            result = converter.convert(tmp.name)  # returns conversion result
            doc = result.document  # DoclingDocument
            md = doc.export_to_markdown()         # Markdown export
            j = json.dumps(doc.export_to_dict())  # JSON export
            rows.append((row.document_id, row.path, md, j))

    if rows:
        out_df = spark.createDataFrame(
            rows, ["document_id", "path", "markdown", "json"]
        )
        out_df.write.mode("append").saveAsTable(target_table)

query = (
    df.writeStream
    .option("checkpointLocation", "/Volumes/default/checkpoints/unstructured_docs_docling")
    .foreachBatch(parse_batch)
    .start()
)&lt;/LI-CODE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 11 Feb 2026 00:14:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/best-practice-to-parse-binary-file-like-pdf-png-docx/m-p/147966#M52802</guid>
      <dc:creator>MoJaMa</dc:creator>
      <dc:date>2026-02-11T00:14:31Z</dc:date>
    </item>
  </channel>
</rss>

