<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic ai_parse_document Not Extracting Text from Images in PDF in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/ai-parse-document-not-extracting-text-from-images-in-pdf/m-p/141096#M4455</link>
    <description>&lt;P&gt;Hello Team,&lt;/P&gt;&lt;P&gt;I hope you are doing well.&lt;/P&gt;&lt;P&gt;I am a student currently exploring Databricks and learning how to work with the ai_parse_document function. While experimenting, I encountered a couple of issues related to text extraction from images inside PDF files. I wanted to share the details along with the code snippets I used.&lt;/P&gt;&lt;H3&gt;&lt;STRONG&gt;1. Text not extracted from all images in a PDF&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;I tested a PDF that contains &lt;STRONG&gt;two images&lt;/STRONG&gt;, and each image has text inside it.&lt;BR /&gt;However, ai_parse_document extracts text from &lt;STRONG&gt;only one&lt;/STRONG&gt; of the images.&lt;BR /&gt;The text from the second image is not extracted at all.&lt;/P&gt;&lt;HR /&gt;&lt;H3&gt;&lt;STRONG&gt;2. Images ignored in PDFs containing images + paragraphs&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;In another PDF containing both &lt;STRONG&gt;paragraph text&lt;/STRONG&gt; and &lt;STRONG&gt;multiple images&lt;/STRONG&gt;, the function extracts the paragraph text correctly, but &lt;STRONG&gt;no text is extracted from images&lt;/STRONG&gt;.&lt;/P&gt;&lt;H3&gt;&lt;STRONG&gt;Code Snippet Used&lt;/STRONG&gt;&lt;/H3&gt;&lt;LI-CODE lang="python"&gt;%sql
WITH parsed_documents AS (
    SELECT
      path,
      ai_parse_document(
        content,
        map(
          'imageOutputPath', '/Volumes/demo_raj_cat/demo_schema_cat/demo_volume_cat/demo_dir_cat/',
          'descriptionElementTypes', '*'
        )
      ) AS parsed
    FROM READ_FILES('/Volumes/demo_raj_cat/demo_schema_cat/demo_volume_cat/demo_dir_cat/pdf_with_two_images_part4.pdf', format =&amp;gt; 'binaryFile')
  ),
  parsed_text AS (
    SELECT
      path,
      concat_ws(
        '\n\n',
        transform(
          try_cast(parsed:document:elements AS ARRAY&amp;lt;STRING&amp;gt;),
          element -&amp;gt; try_cast(element:content AS STRING)
        )
      ) AS text
    FROM parsed_documents
    WHERE try_cast(parsed:error_status AS STRING) IS NULL
  )
  SELECT
    path,
    text,
    ai_query(
      'databricks-meta-llama-3-3-70b-instruct',
      concat(
        'Extract the following information from the document  ',
        text
      ),
      returnType =&amp;gt; 'STRING'
    ) AS structured_data
  FROM parsed_text
  WHERE text IS NOT NULL;&lt;/LI-CODE&gt;&lt;HR /&gt;&lt;H3&gt;&lt;STRONG&gt;Attachments&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;I have also attached the PDF files used for testing.&lt;/P&gt;&lt;P&gt;I kindly request your guidance on why text inside images is not being fully extracted and whether there are additional configurations needed.&lt;/P&gt;&lt;P&gt;Thank you very much for your support.&lt;/P&gt;&lt;P&gt;Warm regards,&lt;BR /&gt;&lt;STRONG&gt;Raj&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 04 Dec 2025 06:04:23 GMT</pubDate>
    <dc:creator>rajcoder</dc:creator>
    <dc:date>2025-12-04T06:04:23Z</dc:date>
    <item>
      <title>ai_parse_document Not Extracting Text from Images in PDF</title>
      <link>https://community.databricks.com/t5/machine-learning/ai-parse-document-not-extracting-text-from-images-in-pdf/m-p/141096#M4455</link>
      <description>&lt;P&gt;Hello Team,&lt;/P&gt;&lt;P&gt;I hope you are doing well.&lt;/P&gt;&lt;P&gt;I am a student currently exploring Databricks and learning how to work with the ai_parse_document function. While experimenting, I encountered a couple of issues related to text extraction from images inside PDF files. I wanted to share the details along with the code snippets I used.&lt;/P&gt;&lt;H3&gt;&lt;STRONG&gt;1. Text not extracted from all images in a PDF&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;I tested a PDF that contains &lt;STRONG&gt;two images&lt;/STRONG&gt;, and each image has text inside it.&lt;BR /&gt;However, ai_parse_document extracts text from &lt;STRONG&gt;only one&lt;/STRONG&gt; of the images.&lt;BR /&gt;The text from the second image is not extracted at all.&lt;/P&gt;&lt;HR /&gt;&lt;H3&gt;&lt;STRONG&gt;2. Images ignored in PDFs containing images + paragraphs&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;In another PDF containing both &lt;STRONG&gt;paragraph text&lt;/STRONG&gt; and &lt;STRONG&gt;multiple images&lt;/STRONG&gt;, the function extracts the paragraph text correctly, but &lt;STRONG&gt;no text is extracted from images&lt;/STRONG&gt;.&lt;/P&gt;&lt;H3&gt;&lt;STRONG&gt;Code Snippet Used&lt;/STRONG&gt;&lt;/H3&gt;&lt;LI-CODE lang="python"&gt;%sql
WITH parsed_documents AS (
    SELECT
      path,
      ai_parse_document(
        content,
        map(
          'imageOutputPath', '/Volumes/demo_raj_cat/demo_schema_cat/demo_volume_cat/demo_dir_cat/',
          'descriptionElementTypes', '*'
        )
      ) AS parsed
    FROM READ_FILES('/Volumes/demo_raj_cat/demo_schema_cat/demo_volume_cat/demo_dir_cat/pdf_with_two_images_part4.pdf', format =&amp;gt; 'binaryFile')
  ),
  parsed_text AS (
    SELECT
      path,
      concat_ws(
        '\n\n',
        transform(
          try_cast(parsed:document:elements AS ARRAY&amp;lt;STRING&amp;gt;),
          element -&amp;gt; try_cast(element:content AS STRING)
        )
      ) AS text
    FROM parsed_documents
    WHERE try_cast(parsed:error_status AS STRING) IS NULL
  )
  SELECT
    path,
    text,
    ai_query(
      'databricks-meta-llama-3-3-70b-instruct',
      concat(
        'Extract the following information from the document  ',
        text
      ),
      returnType =&amp;gt; 'STRING'
    ) AS structured_data
  FROM parsed_text
  WHERE text IS NOT NULL;&lt;/LI-CODE&gt;&lt;HR /&gt;&lt;H3&gt;&lt;STRONG&gt;Attachments&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;I have also attached the PDF files used for testing.&lt;/P&gt;&lt;P&gt;I kindly request your guidance on why text inside images is not being fully extracted and whether there are additional configurations needed.&lt;/P&gt;&lt;P&gt;Thank you very much for your support.&lt;/P&gt;&lt;P&gt;Warm regards,&lt;BR /&gt;&lt;STRONG&gt;Raj&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 04 Dec 2025 06:04:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/ai-parse-document-not-extracting-text-from-images-in-pdf/m-p/141096#M4455</guid>
      <dc:creator>rajcoder</dc:creator>
      <dc:date>2025-12-04T06:04:23Z</dc:date>
    </item>
    <item>
      <title>Re: ai_parse_document Not Extracting Text from Images in PDF</title>
      <link>https://community.databricks.com/t5/machine-learning/ai-parse-document-not-extracting-text-from-images-in-pdf/m-p/141130#M4457</link>
      <description>&lt;P&gt;Hello&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/199725"&gt;@rajcoder&lt;/a&gt;!&lt;/P&gt;
&lt;P&gt;This post appears to duplicate the one you recently posted. A response has already been provided to your &lt;A href="https://community.databricks.com/t5/generative-ai/issue-with-ai-parse-document-not-extracting-text-from-images-in/td-p/141099" target="_blank"&gt;recent post&lt;/A&gt;. I recommend continuing the discussion in that thread to keep the conversation focused and organised.&lt;/P&gt;</description>
      <pubDate>Thu, 04 Dec 2025 10:12:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/ai-parse-document-not-extracting-text-from-images-in-pdf/m-p/141130#M4457</guid>
      <dc:creator>Advika</dc:creator>
      <dc:date>2025-12-04T10:12:42Z</dc:date>
    </item>
  </channel>
</rss>

