<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: PDF Parsing in Notebook in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/108453#M43052</link>
    <description>&lt;P&gt;PDF Data Source works now on Databricks.&lt;BR /&gt;Instruction with example: &lt;A href="https://stabrise.com/blog/spark-pdf-on-databricks/" target="_self"&gt;https://stabrise.com/blog/spark-pdf-on-databricks/&lt;/A&gt;&lt;/P&gt;</description>
    <pubDate>Sun, 02 Feb 2025 17:17:29 GMT</pubDate>
    <dc:creator>Mykola_Melnyk</dc:creator>
    <dc:date>2025-02-02T17:17:29Z</dc:date>
    <item>
      <title>PDF Parsing in Notebook</title>
      <link>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/14636#M9108</link>
      <description>&lt;P&gt;I have pdf files stored in azure adls.&lt;/P&gt;&lt;P&gt;i want to parse pdf files in pyspark dataframes&lt;/P&gt;&lt;P&gt;how can i do that ?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 23 Sep 2021 08:37:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/14636#M9108</guid>
      <dc:creator>Kamal2</dc:creator>
      <dc:date>2021-09-23T08:37:09Z</dc:date>
    </item>
    <item>
      <title>Re: PDF Parsing in Notebook</title>
      <link>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/14638#M9110</link>
      <description>&lt;P&gt;I know of &lt;A href="https://tika.apache.org" alt="https://tika.apache.org" target="_blank"&gt;Apache Tika&lt;/A&gt;.  But that is a java lib and I do not know if there are python bindings.&lt;/P&gt;&lt;P&gt;Pypi has a python version though:&lt;/P&gt;&lt;P&gt;&lt;A href="https://pypi.org/project/tika/" alt="https://pypi.org/project/tika/" target="_blank"&gt;https://pypi.org/project/tika/&lt;/A&gt;&lt;/P&gt;&lt;P&gt;It might help.&lt;/P&gt;</description>
      <pubDate>Thu, 23 Sep 2021 12:31:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/14638#M9110</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2021-09-23T12:31:42Z</dc:date>
    </item>
    <item>
      <title>Re: PDF Parsing in Notebook</title>
      <link>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/14639#M9111</link>
      <description>&lt;P&gt;If you have familiarity with Scala you can use &lt;A href="https://tika.apache.org/" alt="https://tika.apache.org/" target="_blank"&gt;Tika&lt;/A&gt;. Tika is a wrapper around PDFBox. In case you want to use it in Databricks I suggest you to go through this &lt;A href="https://medium.com/@debusinha2009/processing-pdf-data-with-apache-pdfbox-and-apache-spark-at-scale-on-databricks-85b4f8daee78" alt="https://medium.com/@debusinha2009/processing-pdf-data-with-apache-pdfbox-and-apache-spark-at-scale-on-databricks-85b4f8daee78" target="_blank"&gt;blog&lt;/A&gt; and &lt;A href="https://github.com/debu-sinha/PDFBox-Databricks" alt="https://github.com/debu-sinha/PDFBox-Databricks" target="_blank"&gt;Git repo&lt;/A&gt;. For python based codes you may want to use &lt;A href="https://pypi.org/project/PyPDF2/" alt="https://pypi.org/project/PyPDF2/" target="_blank"&gt;PyPDF2  &lt;/A&gt;as a &lt;A href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.pandas_udf.html" alt="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.pandas_udf.html" target="_blank"&gt;pandas UDF&lt;/A&gt; in Spark. &lt;/P&gt;</description>
      <pubDate>Fri, 15 Oct 2021 15:31:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/14639#M9111</guid>
      <dc:creator>morganmazouchi</dc:creator>
      <dc:date>2021-10-15T15:31:23Z</dc:date>
    </item>
    <item>
      <title>Re: PDF Parsing in Notebook</title>
      <link>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/100045#M40178</link>
      <description>&lt;P&gt;Please look to the&amp;nbsp;&lt;A href="https://github.com/StabRise/spark-pdf" target="_self"&gt;PDF DataSource for Apache Spark.&lt;/A&gt;&lt;/P&gt;&lt;P&gt;This project provides a custom data source for the&lt;SPAN&gt;&amp;nbsp;Apache Spark&amp;nbsp;&lt;/SPAN&gt;that allows you to read PDF files into the Spark DataFrame. And &lt;A href="https://github.com/StabRise/spark-pdf/blob/main/examples/PdfDataSource.ipynb" target="_self"&gt;here&lt;/A&gt; notebook with example of usage.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;df = spark.read.format("pdf") \
    .option("imageType", "BINARY") \
    .option("resolution", "200") \
    .option("pagePerPartition", "2") \
    .option("reader", "pdfBox") \
    .load("path to the pdf file(s)")

df.show()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 26 Nov 2024 10:06:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/100045#M40178</guid>
      <dc:creator>Mykola_Melnyk</dc:creator>
      <dc:date>2024-11-26T10:06:46Z</dc:date>
    </item>
    <item>
      <title>Re: PDF Parsing in Notebook</title>
      <link>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/108453#M43052</link>
      <description>&lt;P&gt;PDF Data Source works now on Databricks.&lt;BR /&gt;Instruction with example: &lt;A href="https://stabrise.com/blog/spark-pdf-on-databricks/" target="_self"&gt;https://stabrise.com/blog/spark-pdf-on-databricks/&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 02 Feb 2025 17:17:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/108453#M43052</guid>
      <dc:creator>Mykola_Melnyk</dc:creator>
      <dc:date>2025-02-02T17:17:29Z</dc:date>
    </item>
    <item>
      <title>Re: PDF Parsing in Notebook</title>
      <link>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/115535#M45107</link>
      <description>&lt;P&gt;Spark PDF works now with Unity Catalog volumes, started from 0.1.16 version: more details here:&amp;nbsp;&lt;STRONG&gt;&lt;SPAN&gt;&lt;A href="https://stabrise.com/blog/spark-pdf-databricks-unity-catalog/" target="_blank"&gt;https://stabrise.com/blog/spark-pdf-databricks-unity-catalog/&lt;/A&gt;&lt;A href="https://stabrise.com/blog/spark-pdf-databricks-unity-catalog/" target="_self"&gt;&lt;BR /&gt;&lt;/A&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 15 Apr 2025 14:22:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pdf-parsing-in-notebook/m-p/115535#M45107</guid>
      <dc:creator>Mykola_Melnyk</dc:creator>
      <dc:date>2025-04-15T14:22:29Z</dc:date>
    </item>
  </channel>
</rss>

