<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Issue while reading external iceberg table from GCS path using spark SQL in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114586#M44876</link>
    <description>&lt;P&gt;Similar issue exists for Azure as well &lt;A href="https://github.com/apache/iceberg/issues/10808#issuecomment-2263673628" target="_blank"&gt;https://github.com/apache/iceberg/issues/10808#issuecomment-2263673628&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Can this be fixed at the Databricks level?&lt;/P&gt;</description>
    <pubDate>Sat, 05 Apr 2025 11:06:05 GMT</pubDate>
    <dc:creator>Arvind007</dc:creator>
    <dc:date>2025-04-05T11:06:05Z</dc:date>
    <item>
      <title>Issue while reading external iceberg table from GCS path using spark SQL</title>
      <link>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114406#M44810</link>
      <description>&lt;DIV&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV&gt;&lt;DIV class=""&gt;&lt;DIV&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;LI-CODE lang="markup"&gt;df = spark.sql("select * from bqms_table;");
df.show();&lt;/LI-CODE&gt;&lt;DIV&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV&gt;ENV - DBRT&amp;nbsp;&lt;SPAN&gt;16.3 (includes Apache Spark 3.5.2, Scala 2.12)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&lt;A target="_blank"&gt;org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.1&lt;/A&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;Py4JJavaError: An error occurred while calling o471.showString. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 7) (10.0.0.17 executor driver): java.lang.UnsupportedOperationException: Byte-buffer read unsupported by com.databricks.common.filesystem.LokiGCSInputStream at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:160) at com.databricks.spark.metrics.FSInputStreamWithMetrics.$anonfun$read$1(FileSystemWithMetrics.scala:77) at com.databricks.spark.metrics.FSInputStreamWithMetrics.withTimeAndBytesReadMetric(FileSystemWithMetrics.scala:67) at com.databricks.spark.metrics.FSInputStreamWithMetrics.read(FileSystemWithMetrics.scala:77) at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:156) at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream$H2Reader.read(H2SeekableInputStream.java:89) at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream.readFully(H2SeekableInputStream.java:108) at org.apache.iceberg.shaded.org.apache.parquet.hadoop.util.H2SeekableInputStream.readFully(H2SeekableInputStream.java:83) at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:622) at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.&amp;lt;init&amp;gt;(ParquetFileReader.java:934) at 
org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.&amp;lt;init&amp;gt;(ParquetFileReader.java:925) at org.apache.iceberg.shaded.org.apache.parquet.hadoop.ParquetFileReader.open(ParquetFileReader.java:710) at org.apache.iceberg.parquet.ReadConf.newReader(ReadConf.java:194) at org.apache.iceberg.parquet.ReadConf.&amp;lt;init&amp;gt;(ReadConf.java:76) at org.apache.iceberg.parquet.VectorizedParquetReader.init(VectorizedParquetReader.java:90) at org.apache.iceberg.parquet.VectorizedParquetReader.iterator(VectorizedParquetReader.java:99) at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:116) at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:43) at org.apache.iceberg.spark.source.BaseReader.next(BaseReader.java:134) at org.apache.spark.sql.execution.datasources.v2.PartitionIterator.hasNext(DataSourceRDD.scala:122) at org.apache.spark.sql.execution.datasources.v2.MetricsIterator.hasNext(DataSourceRDD.scala:160) at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1(DataSourceRDD.scala:64) at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1$adapted(DataSourceRDD.scala:64) at scala.Option.exists(Option.scala:376) at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:64) at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.advanceToNextIter(DataSourceRDD.scala:99) at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:64) at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) at 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:50)&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Thu, 03 Apr 2025 14:10:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114406#M44810</guid>
      <dc:creator>Arvind007</dc:creator>
      <dc:date>2025-04-03T14:10:51Z</dc:date>
    </item>
    <item>
      <title>Re: Issue while reading external iceberg table from GCS path using spark SQL</title>
      <link>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114434#M44823</link>
      <description>&lt;P&gt;The error you're encountering is related to a compatibility issue between Databricks' GCS implementation and Apache Iceberg when trying to read Iceberg tables from Google Cloud Storage. The specific error is:&lt;/P&gt;
&lt;P&gt;```&lt;BR /&gt;java.lang.UnsupportedOperationException: Byte-buffer read unsupported by com.databricks.common.filesystem.LokiGCSInputStream&lt;BR /&gt;```&lt;/P&gt;
&lt;P&gt;This indicates that the Databricks GCS file system implementation (`LokiGCSInputStream`) doesn't support the byte-buffer read operations that Iceberg requires when reading Parquet files.&lt;/P&gt;
&lt;P&gt;Potential Solutions&lt;/P&gt;
&lt;P&gt;1. Use a Different FileIO Implementation&lt;/P&gt;
&lt;P&gt;You need to configure Iceberg to use a different FileIO implementation that's compatible with Databricks' GCS integration. Try setting the following configuration:&lt;/P&gt;
&lt;P&gt;```python&lt;BR /&gt;spark.conf.set("spark.sql.catalog.your_catalog_name.io-impl", "org.apache.iceberg.gcp.gcs.GCSFileIO")&lt;BR /&gt;```&lt;/P&gt;
&lt;P&gt;2. Update Catalog Configuration&lt;/P&gt;
&lt;P&gt;Ensure your catalog is properly configured with the correct GCS credentials and implementation:&lt;/P&gt;
&lt;P&gt;```python&lt;BR /&gt;Configure Iceberg catalog&lt;BR /&gt;spark.conf.set("spark.sql.catalog.your_catalog_name", "org.apache.iceberg.spark.SparkCatalog")&lt;BR /&gt;spark.conf.set("spark.sql.catalog.your_catalog_name.type", "hadoop")&lt;BR /&gt;spark.conf.set("spark.sql.catalog.your_catalog_name.warehouse", "gs://your-bucket/path")&lt;BR /&gt;spark.conf.set("spark.sql.catalog.your_catalog_name.io-impl", "org.apache.iceberg.gcp.gcs.GCSFileIO")&lt;BR /&gt;```&lt;/P&gt;
&lt;P&gt;3. Check Iceberg Version Compatibility&lt;/P&gt;
&lt;P&gt;The issue might be related to compatibility between Iceberg 1.5.1 and Databricks Runtime 16.3. Try using a different Iceberg version that's known to work with Databricks, such as 1.4.2:&lt;/P&gt;
&lt;P&gt;```python&lt;BR /&gt;Include in your spark configuration&lt;BR /&gt;spark.conf.set("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2,org.apache.iceberg:iceberg-gcp-bundle:1.4.2")&lt;BR /&gt;```&lt;/P&gt;
&lt;P&gt;4. Use Absolute Paths&lt;/P&gt;
&lt;P&gt;Iceberg requires absolute paths to locate metadata files and data files. Make sure you're using the full GCS path:&lt;/P&gt;
&lt;P&gt;```python&lt;BR /&gt;# Instead of using a table name reference&lt;BR /&gt;df = spark.sql("SELECT * FROM gs://your-bucket/path/to/table")&lt;BR /&gt;```&lt;/P&gt;
&lt;P&gt;5. Consider Using Unity Catalog&lt;/P&gt;
&lt;P&gt;If possible, consider using Databricks Unity Catalog with Iceberg reads enabled, which provides better integration:&lt;/P&gt;
&lt;P&gt;```sql&lt;BR /&gt;CREATE TABLE T(c1 INT) TBLPROPERTIES(&lt;BR /&gt;'delta.columnMapping.mode' = 'name',&lt;BR /&gt;'delta.enableIcebergCompatV2' = 'true',&lt;BR /&gt;'delta.universalFormat.enabledFormats' = 'iceberg'&lt;BR /&gt;);&lt;BR /&gt;```&lt;/P&gt;
&lt;P&gt;This is a known issue with Iceberg and certain file system implementations that don't support byte-buffer reads. The error occurs during the reading of Parquet file footers, which Iceberg uses to build its metadata model.&lt;/P&gt;</description>
      <pubDate>Thu, 03 Apr 2025 17:07:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114434#M44823</guid>
      <dc:creator>Louis_Frolio</dc:creator>
      <dc:date>2025-04-03T17:07:03Z</dc:date>
    </item>
    <item>
      <title>Re: Issue while reading external iceberg table from GCS path using spark SQL</title>
      <link>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114586#M44876</link>
      <description>&lt;P&gt;Similar issue exists for Azure as well &lt;A href="https://github.com/apache/iceberg/issues/10808#issuecomment-2263673628" target="_blank"&gt;https://github.com/apache/iceberg/issues/10808#issuecomment-2263673628&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Can this be fixed at the Databricks level?&lt;/P&gt;</description>
      <pubDate>Sat, 05 Apr 2025 11:06:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114586#M44876</guid>
      <dc:creator>Arvind007</dc:creator>
      <dc:date>2025-04-05T11:06:05Z</dc:date>
    </item>
    <item>
      <title>Re: Issue while reading external iceberg table from GCS path using spark SQL</title>
      <link>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114590#M44877</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Arvind007_1-1743851403642.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15830i0C86F37AB1B69C30/image-size/medium?v=v2&amp;amp;px=400" role="button" title="Arvind007_1-1743851403642.png" alt="Arvind007_1-1743851403642.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Arvind007_0-1743851283272.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15829iE896F094585DE5B7/image-size/medium?v=v2&amp;amp;px=400" role="button" title="Arvind007_0-1743851283272.png" alt="Arvind007_0-1743851283272.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Arvind007_2-1743851437384.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15831i8F245C2D6322FDA0/image-size/medium?v=v2&amp;amp;px=400" role="button" title="Arvind007_2-1743851437384.png" alt="Arvind007_2-1743851437384.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;I tried the given solutions but it seems the issue still persists. I would appreciate it if this can be resolved by Databricks soon, for better integration between GCP and Databricks.&lt;/P&gt;</description>
      <pubDate>Sat, 05 Apr 2025 11:12:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issue-while-reading-external-iceberg-table-from-gcs-path-using/m-p/114590#M44877</guid>
      <dc:creator>Arvind007</dc:creator>
      <dc:date>2025-04-05T11:12:07Z</dc:date>
    </item>
  </channel>
</rss>

