<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Convert pdf's is into structured data in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/convert-pdf-s-is-into-structured-data/m-p/21307#M14507</link>
    <description>&lt;P&gt;Is there anything on Databricks to help read PDF (payment invoices and receipts for example) and convert it to structured data?&lt;/P&gt;</description>
    <pubDate>Wed, 23 Jun 2021 19:43:14 GMT</pubDate>
    <dc:creator>User16826987838</dc:creator>
    <dc:date>2021-06-23T19:43:14Z</dc:date>
    <item>
      <title>Convert pdf's is into structured data</title>
      <link>https://community.databricks.com/t5/data-engineering/convert-pdf-s-is-into-structured-data/m-p/21307#M14507</link>
      <description>&lt;P&gt;Is there anything on Databricks to help read PDF (payment invoices and receipts for example) and convert it to structured data?&lt;/P&gt;</description>
      <pubDate>Wed, 23 Jun 2021 19:43:14 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/convert-pdf-s-is-into-structured-data/m-p/21307#M14507</guid>
      <dc:creator>User16826987838</dc:creator>
      <dc:date>2021-06-23T19:43:14Z</dc:date>
    </item>
    <item>
      <title>Re: Convert pdf's is into structured data</title>
      <link>https://community.databricks.com/t5/data-engineering/convert-pdf-s-is-into-structured-data/m-p/21308#M14508</link>
      <description>&lt;P&gt;There are several open-source options, for example Tesseract:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;def ocr_image(image_bytes):
  return pytesseract.image_to_string(Image.open(io.BytesIO(image_bytes)))&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 23 Jun 2021 21:05:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/convert-pdf-s-is-into-structured-data/m-p/21308#M14508</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2021-06-23T21:05:06Z</dc:date>
    </item>
    <item>
      <title>Re: Convert pdf's is into structured data</title>
      <link>https://community.databricks.com/t5/data-engineering/convert-pdf-s-is-into-structured-data/m-p/62908#M32132</link>
      <description>&lt;P&gt;&lt;SPAN&gt;Thanks! Converting PDF files is sometimes a difficult task, as not all converters are accurate. I want to share with you one interesting tool I recently discovered that can make your work even more efficient. I recently came across an amazing online tool, &lt;A href="https://pdfflex.com/docx-to-pdf" target="_blank"&gt;https://pdfflex.com/docx-to-pdf&lt;/A&gt;, that allows you to convert DOCX to PDF effortlessly. All you have to do is upload your DOCX file and it will be converted in seconds. You can then easily download the PDF file.&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 07 Mar 2024 15:17:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/convert-pdf-s-is-into-structured-data/m-p/62908#M32132</guid>
      <dc:creator>SoniaFoster</dc:creator>
      <dc:date>2024-03-07T15:17:08Z</dc:date>
    </item>
  </channel>
</rss>

