<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Gathering Data Off Of A PDF File in Get Started Discussions</title>
    <link>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/102963#M9725</link>
    <description>&lt;P&gt;I won't be able to see the answer, can you please share it?&lt;/P&gt;</description>
    <pubDate>Mon, 23 Dec 2024 08:36:26 GMT</pubDate>
    <dc:creator>higunjan</dc:creator>
    <dc:date>2024-12-23T08:36:26Z</dc:date>
    <item>
      <title>Gathering Data Off Of A PDF File</title>
      <link>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/75251#M9721</link>
      <description>&lt;P&gt;Hello everyone,&lt;/P&gt;&lt;P&gt;I am developing an application that accepts pdf files and inserts the data into my database. The company in question that distributes this data to us only offers PDF files, which you can see attached below (I hid personal info for privacy reasons). Do any of you know of a tool or route that would make this doable?&lt;/P&gt;&lt;P&gt;Any advice or ideas would be appreciated.&lt;/P&gt;&lt;P&gt;Thank you!&lt;/P&gt;</description>
      <pubDate>Fri, 21 Jun 2024 00:51:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/75251#M9721</guid>
      <dc:creator>trimethylpurine</dc:creator>
      <dc:date>2024-06-21T00:51:16Z</dc:date>
    </item>
    <item>
      <title>Re: Gathering Data Off Of A PDF File</title>
      <link>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/75362#M9723</link>
      <description>&lt;P&gt;Hello&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;, I really appreciate your help!&lt;/P&gt;&lt;P&gt;This makes a lot of sense.&lt;/P&gt;</description>
      <pubDate>Fri, 21 Jun 2024 16:33:57 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/75362#M9723</guid>
      <dc:creator>trimethylpurine</dc:creator>
      <dc:date>2024-06-21T16:33:57Z</dc:date>
    </item>
    <item>
      <title>Re: Gathering Data Off Of A PDF File</title>
      <link>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/78519#M9724</link>
      <description>&lt;P&gt;Thank you so much for the help.&lt;/P&gt;</description>
      <pubDate>Fri, 12 Jul 2024 12:09:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/78519#M9724</guid>
      <dc:creator>NicholasGray</dc:creator>
      <dc:date>2024-07-12T12:09:05Z</dc:date>
    </item>
    <item>
      <title>Re: Gathering Data Off Of A PDF File</title>
      <link>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/102963#M9725</link>
      <description>&lt;P&gt;I won't be able to see the answer, can you please share it?&lt;/P&gt;</description>
      <pubDate>Mon, 23 Dec 2024 08:36:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/102963#M9725</guid>
      <dc:creator>higunjan</dc:creator>
      <dc:date>2024-12-23T08:36:26Z</dc:date>
    </item>
    <item>
      <title>Re: Gathering Data Off Of A PDF File</title>
      <link>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/108457#M9726</link>
      <description>&lt;P&gt;You can use the PDF Data Source to read data from PDF files. Examples here:&amp;nbsp;&lt;A href="https://stabrise.com/blog/spark-pdf-on-databricks/" target="_blank" rel="noopener"&gt;https://stabrise.com/blog/spark-pdf-on-databricks/&lt;/A&gt;&lt;BR /&gt;&lt;BR /&gt;After that, use the &lt;A href="https://stabrise.com/scaledp/" target="_self"&gt;Scale DP library&lt;/A&gt; to extract data from the text in a declarative way using an LLM. Here is an example of extracting data from scanned receipts:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;DIV&gt;&lt;PRE&gt;&lt;SPAN&gt;class &lt;/SPAN&gt;&lt;SPAN&gt;ReceiptSchema&lt;/SPAN&gt;(BaseModel):&lt;BR /&gt;    &lt;SPAN&gt;"""Receipt."""&lt;BR /&gt;&lt;/SPAN&gt;&lt;SPAN&gt;&lt;BR /&gt;&lt;/SPAN&gt;    company_name: &lt;SPAN&gt;str&lt;BR /&gt;&lt;/SPAN&gt;    shop_name: &lt;SPAN&gt;str&lt;BR /&gt;&lt;/SPAN&gt;    company_type: &lt;SPAN&gt;CompanyType &lt;/SPAN&gt;= Field(&lt;BR /&gt;        &lt;SPAN&gt;description&lt;/SPAN&gt;=&lt;SPAN&gt;"Type of the company."&lt;/SPAN&gt;,&lt;BR /&gt;        &lt;SPAN&gt;examples&lt;/SPAN&gt;=[&lt;SPAN&gt;"MARKET"&lt;/SPAN&gt;, &lt;SPAN&gt;"PHARMACY"&lt;/SPAN&gt;],&lt;BR /&gt;    )&lt;BR /&gt;    address: &lt;SPAN&gt;Address&lt;BR /&gt;&lt;/SPAN&gt;    tax_id: &lt;SPAN&gt;str&lt;BR /&gt;&lt;/SPAN&gt;    transaction_date: &lt;SPAN&gt;date &lt;/SPAN&gt;= Field(&lt;SPAN&gt;description&lt;/SPAN&gt;=&lt;SPAN&gt;"Date of the transaction"&lt;/SPAN&gt;)&lt;BR /&gt;    transaction_time: &lt;SPAN&gt;time &lt;/SPAN&gt;= Field(&lt;SPAN&gt;description&lt;/SPAN&gt;=&lt;SPAN&gt;"Time of the transaction"&lt;/SPAN&gt;)&lt;BR /&gt;    total_amount: &lt;SPAN&gt;float&lt;BR /&gt;&lt;/SPAN&gt;    items: &lt;SPAN&gt;list&lt;/SPAN&gt;[ReceiptItem]&lt;/PRE&gt;&lt;DIV&gt;&lt;PRE&gt;&lt;SPAN&gt;extractor &lt;/SPAN&gt;= LLMExtractor(&lt;SPAN&gt;model&lt;/SPAN&gt;=&lt;SPAN&gt;"gemini-1.5-flash"&lt;/SPAN&gt;, &lt;SPAN&gt;schema&lt;/SPAN&gt;=ReceiptSchema, inputCol="text")&lt;BR 
/&gt;&lt;SPAN&gt;extractor&lt;/SPAN&gt;.transform(input_df)&lt;/PRE&gt;&lt;P&gt;And as a result you will have JSON:&lt;/P&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;{
    "company_name": "ROSHEN",
    "shop_name": "TOM HRA",
    "address": "м, Вінниця, Вул, Келецька, /B B",
    "tax_id": "228739826104",
    "transaction_date": "23-10-2824 20:15:52",
    "total_amount": 328.06,
    "items": [
        {
            "name": "Шоколад чорний Brut 805",
            "quantity": 1.0,
            "price_per_unit": 46.31,
            "hko": "4823677632570",
            "price": 46.31
        },
        {
            "name": "Шоколад чорний Brut 80%",
            "quantity": 1.0,
            "price_per_unit": 46.31,
            "hko": "4893877632570",
            "price": 46.31
        },
        {
            "name": "Шоколад чорний Special",
            "quantity": 5.0,
            "price_per_unit": 33.84,
            "hko": "4803077632563",
            "price": 169.2
        },
        {
            "name": "Карамель LolliPops 3 ko",
            "quantity": 8.0,
            "price_per_unit": 18.51,
            "hko": "150",
            "price": 148.08
        },
        {
            "name": "Валі Wafers горіх 216r",
            "quantity": 1.0,
            "price_per_unit": 29.17,
            "hko": "4823877625626",
            "price": 29.17
        }
    ]
}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 02 Feb 2025 17:33:00 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/gathering-data-off-of-a-pdf-file/m-p/108457#M9726</guid>
      <dc:creator>Mykola_Melnyk</dc:creator>
      <dc:date>2025-02-02T17:33:00Z</dc:date>
    </item>
  </channel>
</rss>

