<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas() in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15095#M9461</link>
    <description>&lt;P&gt;@Prasad Wagh​&amp;nbsp;@Kaniz Fatma​&amp;nbsp;I use Standard_DS3_V2 (14GB Memory 4 Cores) in Azure Databricks. Originally, I set Min Worker: 1 Max Workers: 8. Now, Min Worker: 1 Max Worker: 8.&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;But this still doesn't work. 1 worker should be able to finish the task. The data size of the task is small.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I guess the size of the VM cluster is not the cause.&lt;/P&gt;</description>
    <pubDate>Fri, 08 Jul 2022 09:40:00 GMT</pubDate>
    <dc:creator>Dicer</dc:creator>
    <dc:date>2022-07-08T09:40:00Z</dc:date>
    <item>
      <title>Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15089#M9455</link>
      <description>&lt;P&gt;I wrote the following code:&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;data = spark.sql (" SELECT A_adjClose, AA_adjClose, AAL_adjClose, AAP_adjClose, AAPL_adjClose FROM deltabase.a_30min_delta, deltabase.aa_30min_delta, deltabase.aal_30min_delta, deltabase.aap_30min_delta ,deltabase.aapl_30min_delta ")
&amp;nbsp;
&amp;nbsp;
&amp;nbsp;
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
&amp;nbsp;
&amp;nbsp;
&amp;nbsp;
#This part keeps running command
&amp;nbsp;
data_pd = data.toPandas()
&amp;nbsp;
&amp;nbsp;
&amp;nbsp;
df_pct = data_pd.pct_change(1)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;The code stucks in .toPandas() this part. &lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;</description>
      <pubDate>Sat, 02 Jul 2022 11:27:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15089#M9455</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-02T11:27:46Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15090#M9456</link>
      <description>&lt;P&gt;Btw, this is Azure Databricks&lt;/P&gt;</description>
      <pubDate>Sat, 02 Jul 2022 11:28:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15090#M9456</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-02T11:28:52Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15091#M9457</link>
      <description>&lt;P&gt;Try replacing .to_pandas() with .to_pandas_on_spark(). This way, you ensure that the dataframe is processed in a distributed manner across the workers.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 03 Jul 2022 18:19:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15091#M9457</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2022-07-03T18:19:50Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15092#M9458</link>
      <description>&lt;P&gt;I tried to replace .to_pandas() with .to_pandas_on_spark(), but there were 1 warning message and 1 error message:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;/databricks/spark/python/pyspark/sql/&lt;A href="https://dataframe.py" alt="https://dataframe.py" target="_blank"&gt;dataframe.py&lt;/A&gt;:3407: FutureWarning: DataFrame.to_pandas_on_spark is deprecated. Use DataFrame.pandas_api instead.&lt;/P&gt;&lt;P&gt; warnings.warn(&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;SparkException: Job aborted due to stage failure: Task 0 in stage 21.0 failed 4 times, most recent failure: Lost task 0.3 in stage 21.0 (TID 24) (10.139.64.4 executor 3): ExecutorLostFailure (executor 3 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 176959 ms&lt;/P&gt;</description>
      <pubDate>Mon, 04 Jul 2022 20:59:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15092#M9458</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-04T20:59:54Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15093#M9459</link>
      <description>&lt;P&gt;Hi @Cheuk Hin Christophe Poon​&amp;nbsp; can you please try increasing executors if it helps&lt;/P&gt;</description>
      <pubDate>Thu, 07 Jul 2022 13:39:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15093#M9459</guid>
      <dc:creator>User16753725469</dc:creator>
      <dc:date>2022-07-07T13:39:47Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15095#M9461</link>
      <description>&lt;P&gt;@Prasad Wagh​&amp;nbsp;@Kaniz Fatma​&amp;nbsp;I use Standard_DS3_V2 (14GB Memory 4 Cores) in Azure Databricks. Originally, I set Min Worker: 1 Max Workers: 8. Now, Min Worker: 1 Max Worker: 8.&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;But this still doesn't work. 1 worker should be able to finish the task. The data size of the task is small.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I guess the size of the VM cluster is not the cause.&lt;/P&gt;</description>
      <pubDate>Fri, 08 Jul 2022 09:40:00 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15095#M9461</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-08T09:40:00Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15096#M9462</link>
      <description>&lt;P&gt;Since Spark uses lazy evaluation, I bet it is not to_pandas that causes the issue but pct_change, as stated in the quote from the documentation below. It is also better for the pandas dataset to have a unique index when running pct_change (if you have no time field, you can set an incrementing id):&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;df.set_index('month')&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;I&gt;"the current implementation of this API uses Spark’s Window without specifying partition specification. This leads to move all data into single partition in single machine and could cause serious performance degradation. Avoid this method against very large dataset."&lt;/I&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 08 Jul 2022 16:49:59 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15096#M9462</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2022-07-08T16:49:59Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15097#M9463</link>
      <description>&lt;PRE&gt;&lt;CODE&gt;data = spark.sql (" SELECT A_adjClose, AA_adjClose, AAL_adjClose, AAP_adjClose, AAPL_adjClose FROM deltabase.a_30min_delta, deltabase.aa_30min_delta, deltabase.aal_30min_delta, deltabase.aap_30min_delta ,deltabase.aapl_30min_delta ")
&amp;nbsp;
display(data)
&amp;nbsp;
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
&amp;nbsp;
data_pd = data.toPandas()
&amp;nbsp;
#df_pct = data_pd.pct_change(1)
&amp;nbsp;
#display(df_pct)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;@Hubert Dudek​&amp;nbsp;I don't think the problem is the data type. &lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;P&gt;In my original code, there was a date/time, but I am debugging now. And I realized removing the date/time column doesn't solve the problem.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Now, the data types of the data are just float.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Also, I removed pct_change. The problem still exists.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 09 Jul 2022 03:51:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15097#M9463</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-09T03:51:04Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15098#M9464</link>
      <description>&lt;P&gt;@Prasad Wagh​&amp;nbsp;@Kaniz Fatma​&amp;nbsp; Is it possible to submit a full detailed log report to Databricks?&lt;/P&gt;</description>
      <pubDate>Sat, 09 Jul 2022 04:09:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15098#M9464</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-09T04:09:47Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15099#M9465</link>
      <description>&lt;UL&gt;&lt;LI&gt;How many rows does the dataset have?&lt;/LI&gt;&lt;LI&gt;Can you share an Excel export of a sample from the display function?&lt;/LI&gt;&lt;LI&gt;What are the cluster specifics (worker type and runtime version)? Is it standard, high-concurrency, or single-machine?&lt;/LI&gt;&lt;/UL&gt;</description>
      <pubDate>Sat, 09 Jul 2022 12:14:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15099#M9465</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2022-07-09T12:14:12Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15100#M9466</link>
      <description>&lt;P&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Each Delta table has more than 100,000 rows, but each one is only about 3.18 MB.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;I uploaded a photo that contains a sample, but it was not produced with the Python display() function.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 09 Jul 2022 17:04:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15100#M9466</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-09T17:04:04Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15101#M9467</link>
      <description>&lt;UL&gt;&lt;LI&gt;Cluster Model: Standard. Runtime version: 11.0(Includes Apache Spark 3.3.0, Scala 2.12). Worker Type: Standard_DS3_V2. &lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;​Spark Config:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;spark.databricks.delta.autoCompact.enabled true
spark.databricks.delta.optimizeWrite.enabled true&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 09 Jul 2022 17:07:48 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15101#M9467</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-09T17:07:48Z</dc:date>
    </item>
    <item>
      <title>Re: Failed to convert Spark.sql to Pandas Dataframe using .toPandas()</title>
      <link>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15102#M9468</link>
      <description>&lt;P&gt;I just discovered a solution.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Today, I opened Azure Databricks. When I imported the Python libraries, Databricks told me that toPandas() was deprecated and suggested that I use toPandas.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;B&gt;The following solution works: Use &lt;U&gt;toPandas&lt;/U&gt; instead of&lt;U&gt; toPandas()&lt;/U&gt; &lt;/B&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;data = spark.sql (" SELECT A_adjClose, AA_adjClose, AAL_adjClose, AAP_adjClose, AAPL_adjClose FROM deltabase.a_30min_delta, deltabase.aa_30min_delta, deltabase.aal_30min_delta, deltabase.aap_30min_delta ,deltabase.aapl_30min_delta ")
 
display(data)
 
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
 
data_pd = data.toPandas&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Yet, when I tried to calculate percentage change using pct_change(), it didn't work. pct_change() hasn't been put into pyspark.pandas &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;#This failed because pct_change() function has not been put into pyspark.pandas
df_pct = data_pd.pct_change(1)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;B&gt;Another solution is to use:&lt;U&gt; pandas_api()&lt;/U&gt; to convert the spark dataframe to pandas-spark dataframe.&lt;/B&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;This allows me to use pct_change() after converting spark dataframe to pandas-spark dataframe&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;data_pd = data.pandas_api()
data_pd.pct_change()&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;Source: &lt;A href="https://stackoverflow.com/questions/73061556/does-pyspark-pandas-support-pandas-pct-change-function/73061837?noredirect=1#comment129063659_73061837" alt="https://stackoverflow.com/questions/73061556/does-pyspark-pandas-support-pandas-pct-change-function/73061837?noredirect=1#comment129063659_73061837" target="_blank"&gt;https://stackoverflow.com/questions/73061556/does-pyspark-pandas-support-pandas-pct-change-function/73061837?noredirect=1#comment129063659_73061837&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 19 Jul 2022 06:39:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/failed-to-convert-spark-sql-to-pandas-dataframe-using-topandas/m-p/15102#M9468</guid>
      <dc:creator>Dicer</dc:creator>
      <dc:date>2022-07-19T06:39:47Z</dc:date>
    </item>
  </channel>
</rss>

