11-19-2022 12:03 AM
Hi,
I want to read an Excel file using:
filepath_xlsx = "dbfs:/FileStore/data.xlsx"
sampleDF = (spark.read.format("com.crealytics.spark.excel")
.option("Header", "true")
.option("inferSchema", "false")
.option("treatEmptyValuesAsNulls", "false")
.load(filepath_xlsx)
)
However, I get the error:
java.lang.NoClassDefFoundError: shadeio/poi/schemas/vmldrawing/XmlDocument
Py4JJavaError Traceback (most recent call last)
<command-3204971640072140> in <cell line: 2>()
1 # Read excel file
----> 2 sample1DF = (spark.read.format("com.crealytics.spark.excel")
3 .option("Header", "true")
4 .option("inferSchema", "false")
5 .option("treatEmptyValuesAsNulls", "false")
/databricks/spark/python/pyspark/instrumentation_utils.py in wrapper(*args, **kwargs)
46 start = time.perf_counter()
47 try:
---> 48 res = func(*args, **kwargs)
49 logger.log_success(
50 module_name, class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
175 self.options(**options)
176 if isinstance(path, str):
--> 177 return self._df(self._jreader.load(path))
178 elif path is not None:
179 if type(path) != list:
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py in __call__(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
194 def deco(*a: Any, **kw: Any) -> Any:
195 try:
--> 196 return f(*a, **kw)
197 except Py4JJavaError as e:
198 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o918.load.
: java.lang.NoClassDefFoundError: shadeio/poi/schemas/vmldrawing/XmlDocument
at shadeio.poi.xssf.usermodel.XSSFVMLDrawing.read(XSSFVMLDrawing.java:135)
at shadeio.poi.xssf.usermodel.XSSFVMLDrawing.<init>(XSSFVMLDrawing.java:123)
at shadeio.poi.ooxml.POIXMLFactory.createDocumentPart(POIXMLFactory.java:61)
at shadeio.poi.ooxml.POIXMLDocumentPart.read(POIXMLDocumentPart.java:661)
at shadeio.poi.ooxml.POIXMLDocumentPart.read(POIXMLDocumentPart.java:678)
at shadeio.poi.ooxml.POIXMLDocument.load(POIXMLDocument.java:165)
at shadeio.poi.xssf.usermodel.XSSFWorkbook.<init>(XSSFWorkbook.java:274)
at shadeio.poi.xssf.usermodel.XSSFWorkbookFactory.createWorkbook(XSSFWorkbookFactory.java:118)
at shadeio.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:98)
at shadeio.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:36)
at shadeio.poi.ss.usermodel.WorkbookFactory.lambda$create$2(WorkbookFactory.java:224)
at shadeio.poi.ss.usermodel.WorkbookFactory.wp(WorkbookFactory.java:329)
at shadeio.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:224)
at shadeio.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:185)
at com.crealytics.spark.excel.DefaultWorkbookReader.$anonfun$openWorkbook$1(WorkbookReader.scala:55)
at scala.Option.fold(Option.scala:251)
at com.crealytics.spark.excel.DefaultWorkbookReader.openWorkbook(WorkbookReader.scala:55)
at com.crealytics.spark.excel.WorkbookReader.withWorkbook(WorkbookReader.scala:16)
at com.crealytics.spark.excel.WorkbookReader.withWorkbook$(WorkbookReader.scala:15)
at com.crealytics.spark.excel.DefaultWorkbookReader.withWorkbook(WorkbookReader.scala:50)
at com.crealytics.spark.excel.ExcelRelation.excerpt$lzycompute(ExcelRelation.scala:32)
at com.crealytics.spark.excel.ExcelRelation.excerpt(ExcelRelation.scala:32)
at com.crealytics.spark.excel.ExcelRelation.headerColumns$lzycompute(ExcelRelation.scala:104)
at com.crealytics.spark.excel.ExcelRelation.headerColumns(ExcelRelation.scala:103)
at com.crealytics.spark.excel.ExcelRelation.$anonfun$inferSchema$1(ExcelRelation.scala:172)
at scala.Option.getOrElse(Option.scala:189)
at com.crealytics.spark.excel.ExcelRelation.inferSchema(ExcelRelation.scala:171)
at com.crealytics.spark.excel.ExcelRelation.<init>(ExcelRelation.scala:36)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:36)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:13)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:8)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:385)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:368)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:324)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:324)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:237)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:306)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:195)
at py4j.ClientServerConnection.run(ClientServerConnection.java:115)
at java.lang.Thread.run(Thread.java:750)
I installed "com.crealytics:spark-excel_2.12:3.2.1_0.16.4" on the cluster following this thread, but I'm still getting the error.
Environment:
Any idea how to solve this issue?
11-19-2022 12:14 AM
Hi,
Which language did you use to execute that code? Is it okay if you share your data file with me so that I can reproduce the issue on my side?
11-19-2022 04:18 AM
Hi @Mohammad Saber,
I reproduced it, and it seems to be an error related to the spark-excel version.
Can you try installing the latest version, com.crealytics:spark-excel_2.12:3.3.1_0.18.5?
This should fix the issue.
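For reference, once that Maven coordinate is attached to the cluster (Libraries > Install new > Maven in the cluster UI), a read along these lines should work. This is a minimal sketch based on the snippet above; the lowercase option names follow the spark-excel README, and filepath_xlsx is the same DBFS path as in the original post:

filepath_xlsx = "dbfs:/FileStore/data.xlsx"
sampleDF = (
    spark.read.format("com.crealytics.spark.excel")
    .option("header", "true")                    # first row holds the column names
    .option("inferSchema", "false")              # keep every column as string
    .option("treatEmptyValuesAsNulls", "false")
    .load(filepath_xlsx)
)
display(sampleDF)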
11-19-2022 04:54 AM
@davide.anghileri
Thanks a lot. It worked.
I tried to read another Excel file (with several sheets and a multi-row header), and this time I get the error:
org.apache.poi.ooxml.POIXMLException: Strict OOXML isn't currently supported, please see bug #57699
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-496434324351845> in <cell line: 4>()
2
3 # Read excel file
----> 4 sampleDF_xlsx = (spark.read.format("com.crealytics.spark.excel")
5 .option("Header", "true")
6 .option("inferSchema", "false")
/databricks/spark/python/pyspark/instrumentation_utils.py in wrapper(*args, **kwargs)
46 start = time.perf_counter()
47 try:
---> 48 res = func(*args, **kwargs)
49 logger.log_success(
50 module_name, class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
175 self.options(**options)
176 if isinstance(path, str):
--> 177 return self._df(self._jreader.load(path))
178 elif path is not None:
179 if type(path) != list:
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py in __call__(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
194 def deco(*a: Any, **kw: Any) -> Any:
195 try:
--> 196 return f(*a, **kw)
197 except Py4JJavaError as e:
198 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o634.load.
: org.apache.poi.ooxml.POIXMLException: Strict OOXML isn't currently supported, please see bug #57699
at org.apache.poi.ooxml.POIXMLDocumentPart.getPartFromOPCPackage(POIXMLDocumentPart.java:757)
at org.apache.poi.ooxml.POIXMLDocumentPart.<init>(POIXMLDocumentPart.java:151)
at org.apache.poi.ooxml.POIXMLDocumentPart.<init>(POIXMLDocumentPart.java:141)
at org.apache.poi.ooxml.POIXMLDocument.<init>(POIXMLDocument.java:60)
at org.apache.poi.xssf.usermodel.XSSFWorkbook.<init>(XSSFWorkbook.java:254)
at org.apache.poi.xssf.usermodel.XSSFWorkbookFactory.createWorkbook(XSSFWorkbookFactory.java:118)
at org.apache.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:98)
at org.apache.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:36)
at org.apache.poi.ss.usermodel.WorkbookFactory.lambda$create$2(WorkbookFactory.java:224)
at org.apache.poi.ss.usermodel.WorkbookFactory.wp(WorkbookFactory.java:329)
at org.apache.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:224)
at org.apache.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:185)
at com.crealytics.spark.excel.DefaultWorkbookReader.$anonfun$openWorkbook$3(WorkbookReader.scala:107)
at scala.Option.fold(Option.scala:251)
at com.crealytics.spark.excel.DefaultWorkbookReader.openWorkbook(WorkbookReader.scala:107)
at com.crealytics.spark.excel.WorkbookReader.withWorkbook(WorkbookReader.scala:34)
at com.crealytics.spark.excel.WorkbookReader.withWorkbook$(WorkbookReader.scala:33)
at com.crealytics.spark.excel.DefaultWorkbookReader.withWorkbook(WorkbookReader.scala:92)
at com.crealytics.spark.excel.ExcelRelation.excerpt$lzycompute(ExcelRelation.scala:48)
at com.crealytics.spark.excel.ExcelRelation.excerpt(ExcelRelation.scala:48)
at com.crealytics.spark.excel.ExcelRelation.headerColumns$lzycompute(ExcelRelation.scala:121)
at com.crealytics.spark.excel.ExcelRelation.headerColumns(ExcelRelation.scala:120)
at com.crealytics.spark.excel.ExcelRelation.$anonfun$inferSchema$1(ExcelRelation.scala:189)
at scala.Option.getOrElse(Option.scala:189)
at com.crealytics.spark.excel.ExcelRelation.inferSchema(ExcelRelation.scala:188)
at com.crealytics.spark.excel.ExcelRelation.<init>(ExcelRelation.scala:52)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:52)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:29)
at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:24)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:385)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:368)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:324)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:324)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:237)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:306)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:195)
at py4j.ClientServerConnection.run(ClientServerConnection.java:115)
at java.lang.Thread.run(Thread.java:750)
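As an aside on the multi-sheet, multi-row-header file mentioned above: spark-excel's dataAddress option can point the reader at a specific sheet and start cell, which is the usual way to skip extra header rows. A sketch, assuming a hypothetical sheet named 'Sheet2' with data starting at cell A3; note that this only controls where reading starts and does not work around the Strict OOXML error itself, which comes from the underlying Apache POI library:

multiSheetDF = (
    spark.read.format("com.crealytics.spark.excel")
    .option("header", "true")
    .option("dataAddress", "'Sheet2'!A3")   # read sheet 'Sheet2' starting at cell A3
    .option("inferSchema", "false")
    .load(filepath_xlsx)
)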
11-19-2022 04:56 AM
For this dataset, I also tried reading the file as a binary file:
import io
import pandas as pd

# Read the raw Excel bytes with Spark's binary file source
xldf_xlsx = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.xls*")
    .load(filepath_xlsx)
)

# Wrap the bytes in a file-like object and open it with pandas/openpyxl
excel_content = xldf_xlsx.head(1)[0].content
file_like_obj = io.BytesIO(excel_content)
xl = pd.ExcelFile(file_like_obj, engine="openpyxl")
And the last line gives the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<command-496434324351842> in <cell line: 3>()
1 excel_content = xldf_xlsx.head(1)[0].content
2 file_like_obj = io.BytesIO(excel_content)
----> 3 xl = pd.ExcelFile(file_like_obj, engine="openpyxl")
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_base.py in __init__(self, path_or_buffer, engine, storage_options)
1231 self.storage_options = storage_options
1232
-> 1233 self._reader = self._engines[engine](self._io, storage_options=storage_options)
1234
1235 def __fspath__(self):
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py in __init__(self, filepath_or_buffer, storage_options)
520 """
521 import_optional_dependency("openpyxl")
--> 522 super().__init__(filepath_or_buffer, storage_options=storage_options)
523
524 @property
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_base.py in __init__(self, filepath_or_buffer, storage_options)
418 self.handles.handle.seek(0)
419 try:
--> 420 self.book = self.load_workbook(self.handles.handle)
421 except Exception:
422 self.close()
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py in load_workbook(self, filepath_or_buffer)
531 from openpyxl import load_workbook
532
--> 533 return load_workbook(
534 filepath_or_buffer, read_only=True, data_only=True, keep_links=False
535 )
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/excel.py in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
315 reader = ExcelReader(filename, read_only, keep_vba,
316 data_only, keep_links)
--> 317 reader.read()
318 return reader.wb
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/excel.py in read(self)
276 self.read_manifest()
277 self.read_strings()
--> 278 self.read_workbook()
279 self.read_properties()
280 self.read_theme()
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/excel.py in read_workbook(self)
148 wb_part = _find_workbook_part(self.package)
149 self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links)
--> 150 self.parser.parse()
151 wb = self.parser.wb
152 wb._sheets = []
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/workbook.py in parse(self)
47 src = self.archive.read(self.workbook_part_name)
48 node = fromstring(src)
---> 49 package = WorkbookPackage.from_tree(node)
50 if package.properties.date1904:
51 self.wb.epoch = CALENDAR_MAC_1904
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/serialisable.py in from_tree(cls, node)
81 if hasattr(desc, 'from_tree'):
82 #descriptor manages conversion
---> 83 obj = desc.from_tree(el)
84 else:
85 if hasattr(desc.expected_type, "from_tree"):
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/sequence.py in from_tree(self, node)
83
84 def from_tree(self, node):
---> 85 return [self.expected_type.from_tree(el) for el in node]
86
87
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/sequence.py in <listcomp>(.0)
83
84 def from_tree(self, node):
---> 85 return [self.expected_type.from_tree(el) for el in node]
86
87
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/serialisable.py in from_tree(cls, node)
101 attrib[tag] = obj
102
--> 103 return cls(**attrib)
104
105
TypeError: __init__() missing 1 required positional argument: 'id'
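Both failures would be consistent with the workbook having been saved in the Strict Open XML variant, which the POI message above explicitly says is unsupported; whether the openpyxl TypeError has the same cause is an assumption. One quick way to check which variant the file is, is to inspect the XML namespace in xl/workbook.xml (a sketch reusing excel_content from the snippet above; Strict workbooks use the purl.oclc.org namespace, Transitional ones use schemas.openxmlformats.org):

import io
import zipfile

# An .xlsx file is a zip archive; inspect the main workbook part's namespace
with zipfile.ZipFile(io.BytesIO(excel_content)) as zf:
    workbook_xml = zf.read("xl/workbook.xml").decode("utf-8")

if "purl.oclc.org/ooxml/spreadsheetml" in workbook_xml:
    print("Strict Open XML workbook - try re-saving it as a regular .xlsx in Excel")
else:
    print("Transitional (regular) .xlsx workbook")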