<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Issue with complex json based data frame select in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/issue-with-complex-json-based-data-frame-select/m-p/25124#M17431</link>
    <description>&lt;P&gt;We are getting the error below when trying to select nested columns (string type in a struct), even though we don't have more than 1,000 records in the data frame. The schema is very complex and has a few columns of struct type and a few of array type (not selected for processing). We are using Spark 2.4.5 for processing. Please share your input on how we can resolve this issue.&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;Py4JJavaError: An error occurred while calling o18602.collectToPythonFile.
: java.lang.StringIndexOutOfBoundsException: String index out of range: 2147483647
	at java.lang.String.charAt(String.java:658)
	at scala.collection.immutable.StringOps$.apply$extension(StringOps.scala:37)
	at org.apache.af.a(af.java)
	at org.apache.af.apply(af.java)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.bm.b(bm.java)
	at org.apache.ar.a(ar.java)
	at org.apache.ar.apply(ar.java)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.bm.a(bm.java)
	at org.apache.bm.b(bm.java)
	at org.apache.bm.apply(bm.java)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:112)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:109)
	at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
	at scala.collection.immutable.List.foldLeft(List.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:109)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:101)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:101)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$executeAndTrack$1.apply(RuleExecutor.scala:80)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$executeAndTrack$1.apply(RuleExecutor.scala:80)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:79)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$optimizedPlan$1.apply(QueryExecution.scala:96)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$optimizedPlan$1.apply(QueryExecution.scala:96)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:95)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:95)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$toString$2.apply(QueryExecution.scala:248)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$toString$2.apply(QueryExecution.scala:248)
	at org.apache.spark.sql.execution.QueryExecution.stringOrError(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:248)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:104)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:243)
	at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:99)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:173)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3487)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPythonFile$1.apply(Dataset.scala:3373)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPythonFile$1.apply(Dataset.scala:3372)
	at org.apache.spark.api.python.PythonSecurityUtils$$anonfun$withSafePythonFileForUser$2.apply(PythonSecurityUtils.scala:290)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541)
	at org.apache.spark.api.python.PythonSecurityUtils$.withSafePythonFileForUser(PythonSecurityUtils.scala:302)
	at org.apache.spark.sql.Dataset.collectToPythonFile(Dataset.scala:3372)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
	at py4j.Gateway.invoke(Gateway.java:295)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:251)
	at java.lang.Thread.run(Thread.java:748)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Fri, 18 Mar 2022 21:58:04 GMT</pubDate>
    <dc:creator>cmotla</dc:creator>
    <dc:date>2022-03-18T21:58:04Z</dc:date>
    <item>
      <title>Issue with complex json based data frame select</title>
      <link>https://community.databricks.com/t5/data-engineering/issue-with-complex-json-based-data-frame-select/m-p/25124#M17431</link>
      <description>&lt;P&gt;We are getting the error below when trying to select nested columns (string type in a struct), even though we don't have more than 1,000 records in the data frame. The schema is very complex and has a few columns of struct type and a few of array type (not selected for processing). We are using Spark 2.4.5 for processing. Please share your input on how we can resolve this issue.&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;Py4JJavaError: An error occurred while calling o18602.collectToPythonFile.
: java.lang.StringIndexOutOfBoundsException: String index out of range: 2147483647
	at java.lang.String.charAt(String.java:658)
	at scala.collection.immutable.StringOps$.apply$extension(StringOps.scala:37)
	at org.apache.af.a(af.java)
	at org.apache.af.apply(af.java)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.bm.b(bm.java)
	at org.apache.ar.a(ar.java)
	at org.apache.ar.apply(ar.java)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.bm.a(bm.java)
	at org.apache.bm.b(bm.java)
	at org.apache.bm.apply(bm.java)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:112)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:109)
	at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
	at scala.collection.immutable.List.foldLeft(List.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:109)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:101)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:101)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$executeAndTrack$1.apply(RuleExecutor.scala:80)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$executeAndTrack$1.apply(RuleExecutor.scala:80)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:79)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$optimizedPlan$1.apply(QueryExecution.scala:96)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$optimizedPlan$1.apply(QueryExecution.scala:96)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:95)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:95)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$toString$2.apply(QueryExecution.scala:248)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$toString$2.apply(QueryExecution.scala:248)
	at org.apache.spark.sql.execution.QueryExecution.stringOrError(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:248)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:104)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:243)
	at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:99)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:173)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3487)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPythonFile$1.apply(Dataset.scala:3373)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPythonFile$1.apply(Dataset.scala:3372)
	at org.apache.spark.api.python.PythonSecurityUtils$$anonfun$withSafePythonFileForUser$2.apply(PythonSecurityUtils.scala:290)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541)
	at org.apache.spark.api.python.PythonSecurityUtils$.withSafePythonFileForUser(PythonSecurityUtils.scala:302)
	at org.apache.spark.sql.Dataset.collectToPythonFile(Dataset.scala:3372)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
	at py4j.Gateway.invoke(Gateway.java:295)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:251)
	at java.lang.Thread.run(Thread.java:748)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 18 Mar 2022 21:58:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issue-with-complex-json-based-data-frame-select/m-p/25124#M17431</guid>
      <dc:creator>cmotla</dc:creator>
      <dc:date>2022-03-18T21:58:04Z</dc:date>
    </item>
    <item>
      <title>Re: Issue with complex json based data frame select</title>
      <link>https://community.databricks.com/t5/data-engineering/issue-with-complex-json-based-data-frame-select/m-p/25125#M17432</link>
      <description>&lt;P&gt;Please share your code and some example data.&lt;/P&gt;</description>
      <pubDate>Sun, 20 Mar 2022 14:43:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issue-with-complex-json-based-data-frame-select/m-p/25125#M17432</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2022-03-20T14:43:02Z</dc:date>
    </item>
  </channel>
</rss>

