<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Issue with ai_parse_document Not Extracting Text from Images in PDF in Generative AI</title>
    <link>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141126#M1498</link>
    <description>&lt;P&gt;I explained how I processed PDFs in this article&amp;nbsp;&lt;A href="https://databrickster.medium.com/ai-parse-document-get-your-pdf-invoices-into-the-database-05565d3fa8a1" target="_blank"&gt;https://databrickster.medium.com/ai-parse-document-get-your-pdf-invoices-into-the-database-05565d3fa8a1&lt;/A&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 04 Dec 2025 09:51:31 GMT</pubDate>
    <dc:creator>Hubert-Dudek</dc:creator>
    <dc:date>2025-12-04T09:51:31Z</dc:date>
    <item>
      <title>Issue with ai_parse_document Not Extracting Text from Images in PDF</title>
      <link>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141099#M1495</link>
      <description>&lt;P&gt;Hello Team,&lt;/P&gt;&lt;P&gt;I hope you are doing well.&lt;/P&gt;&lt;P&gt;I am a student currently exploring Databricks and learning how to work with the "ai_parse_document" function. While experimenting, I encountered a couple of issues related to text extraction from images inside PDF files. I wanted to share the details along with the code snippets I used.&lt;/P&gt;&lt;H3 id="toc-hId-1423162654"&gt;&lt;STRONG&gt;1. Text not extracted from all images in a PDF&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;I tested a PDF that contains&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;two images&lt;/STRONG&gt;, and each image has text inside it.&lt;BR /&gt;However, "ai_parse_document" extracts text from&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;only one&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;of the images.&lt;BR /&gt;The text from the second image is not extracted at all.&lt;/P&gt;&lt;HR /&gt;&lt;H3 id="toc-hId--1128994307"&gt;&lt;STRONG&gt;2. Images ignored in PDFs containing images + paragraphs&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;In another PDF containing both&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;paragraph text&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;and&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;multiple images&lt;/STRONG&gt;, the function extracts the paragraph text correctly, but&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;no text is extracted from images&lt;/STRONG&gt;.&lt;/P&gt;&lt;H3 id="toc-hId-613816028"&gt;&lt;STRONG&gt;Code Snippet Used&lt;/STRONG&gt;&lt;/H3&gt;&lt;LI-CODE lang="python"&gt;%sql
WITH parsed_documents AS (
    SELECT
      path,
      ai_parse_document(
        content,
        map(
          'imageOutputPath', '/Volumes/demo_raj_cat/demo_schema_cat/demo_volume_cat/demo_dir_cat/',
          'descriptionElementTypes', '*'
        )
      ) AS parsed
    FROM READ_FILES('/Volumes/demo_raj_cat/demo_schema_cat/demo_volume_cat/demo_dir_cat/pdf_with_two_images_part4.pdf', format =&amp;gt; 'binaryFile')
  ),
  parsed_text AS (
    SELECT
      path,
      concat_ws(
        '\n\n',
        transform(
          try_cast(parsed:document:elements AS ARRAY&amp;lt;STRING&amp;gt;),
          element -&amp;gt; try_cast(element:content AS STRING)
        )
      ) AS text
    FROM parsed_documents
    WHERE try_cast(parsed:error_status AS STRING) IS NULL
  )
  SELECT
    path,
    text,
    ai_query(
      'databricks-meta-llama-3-3-70b-instruct',
      concat(
        'Extract the following information from the document  ',
        text
      ),
      returnType =&amp;gt; 'STRING'
    ) AS structured_data
  FROM parsed_text
  WHERE text IS NOT NULL;&lt;/LI-CODE&gt;&lt;HR /&gt;&lt;H3 id="toc-hId--1938340933"&gt;&lt;STRONG&gt;Attachments&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;I have also attached the PDF files used for testing.&lt;/P&gt;&lt;P&gt;I kindly request your guidance on why text inside images is not being fully extracted and whether there are additional configurations needed.&lt;/P&gt;&lt;P&gt;Thank you very much for your support.&lt;/P&gt;&lt;P&gt;Warm regards,&lt;BR /&gt;&lt;STRONG&gt;Raj&lt;/STRONG&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 04 Dec 2025 06:10:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141099#M1495</guid>
      <dc:creator>rajcoder</dc:creator>
      <dc:date>2025-12-04T06:10:55Z</dc:date>
    </item>
    <item>
      <title>Re: Issue with ai_parse_document Not Extracting Text from Images in PDF</title>
      <link>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141126#M1498</link>
      <description>&lt;P&gt;I explained how I processed PDFs in this article&amp;nbsp;&lt;A href="https://databrickster.medium.com/ai-parse-document-get-your-pdf-invoices-into-the-database-05565d3fa8a1" target="_blank"&gt;https://databrickster.medium.com/ai-parse-document-get-your-pdf-invoices-into-the-database-05565d3fa8a1&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 04 Dec 2025 09:51:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141126#M1498</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2025-12-04T09:51:31Z</dc:date>
    </item>
    <item>
      <title>Re: Issue with ai_parse_document Not Extracting Text from Images in PDF</title>
      <link>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141132#M1499</link>
      <description>&lt;P&gt;Thank you for your reply!&lt;/P&gt;&lt;P&gt;Yes, I have gone through your article — it explains very well how to extract text content from PDFs. However, I am facing a different issue.&lt;/P&gt;&lt;P&gt;In my case, the PDF contains &lt;STRONG&gt;multiple images and paragraphs&lt;/STRONG&gt;, but "ai_parse_document" is only able to extract the paragraph text. The images in the PDF (which also contain text inside them) are &lt;STRONG&gt;not being extracted or parsed at all&lt;/STRONG&gt;.&lt;/P&gt;&lt;P&gt;Just wanted to clarify that the issue is specifically related to handling &lt;STRONG&gt;images inside PDFs with text&lt;/STRONG&gt;, not regular PDF text extraction.&lt;/P&gt;&lt;P&gt;Thank you again for your guidance!&lt;/P&gt;</description>
      <pubDate>Thu, 04 Dec 2025 10:18:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141132#M1499</guid>
      <dc:creator>rajcoder</dc:creator>
      <dc:date>2025-12-04T10:18:31Z</dc:date>
    </item>
    <item>
      <title>Re: Issue with ai_parse_document Not Extracting Text from Images in PDF</title>
      <link>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141136#M1500</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/199725"&gt;@rajcoder&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;It can happen. In theory it should work but keep in mind this feature is still in preview and has the following limitations:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_0-1764844757658.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22112i8336C019836CC712/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_0-1764844757658.png" alt="szymon_dybczak_0-1764844757658.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;As you can see they've mentioned that the function can sometimes ignore content (especially for documents that contain highly dense content or content with poor resolution).&lt;/P&gt;&lt;P&gt;Moreover, there's nothing you can do to improve that situation because customizing the model that powers this function is not supported.&lt;/P&gt;</description>
      <pubDate>Thu, 04 Dec 2025 10:42:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/m-p/141136#M1500</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2025-12-04T10:42:02Z</dc:date>
    </item>
  </channel>
</rss>

