<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Transpile a SQL Script into PySpark DataFrame API equivalent code in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/transpile-a-sql-script-into-pyspark-dataframe-api-equivalent/m-p/70108#M33992</link>
    <description>&lt;P&gt;&lt;SPAN class=""&gt;Hi &lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;,&lt;/SPAN&gt;&amp;nbsp;Thanks for your response.&amp;nbsp;&lt;BR /&gt;I'm looking for a utility or an automated way of translating any generic SQL into PySpark DataFrame code.&lt;/P&gt;&lt;P&gt;So, the translate function should look like below:&lt;BR /&gt;&lt;BR /&gt;def translate(input_sql):&lt;BR /&gt;&amp;nbsp; &amp;nbsp; # translate/convert it into pyspark dataframe code&lt;BR /&gt;&amp;nbsp; &amp;nbsp; return pyspark_code&lt;BR /&gt;&lt;BR /&gt;Please suggest&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
    <pubDate>Tue, 21 May 2024 12:25:56 GMT</pubDate>
    <dc:creator>thecodecache</dc:creator>
    <dc:date>2024-05-21T12:25:56Z</dc:date>
    <item>
      <title>Transpile a SQL Script into PySpark DataFrame API equivalent code</title>
      <link>https://community.databricks.com/t5/data-engineering/transpile-a-sql-script-into-pyspark-dataframe-api-equivalent/m-p/68532#M33989</link>
      <description>&lt;P&gt;&lt;STRONG&gt;Input&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;SQL&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;Script (assume any dialect) :&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;SPAN class=""&gt;SELECT&lt;/SPAN&gt; b.se10,
       b.se3,
       b.se_aggrtr_indctr,
       b.key_swipe_ind
&lt;SPAN class=""&gt;FROM&lt;/SPAN&gt;
  (&lt;SPAN class=""&gt;SELECT&lt;/SPAN&gt; se10,
          se3,
          se_aggrtr_indctr,
          &lt;SPAN class=""&gt;ROW_NUMBER&lt;/SPAN&gt;() &lt;SPAN class=""&gt;OVER&lt;/SPAN&gt; (&lt;SPAN class=""&gt;PARTITION&lt;/SPAN&gt; &lt;SPAN class=""&gt;BY&lt;/SPAN&gt; SE10
                             &lt;SPAN class=""&gt;ORDER&lt;/SPAN&gt; &lt;SPAN class=""&gt;BY&lt;/SPAN&gt; se_aggrtr_indctr &lt;SPAN class=""&gt;DESC&lt;/SPAN&gt;) &lt;SPAN class=""&gt;AS&lt;/SPAN&gt; rn,
          key_swipe_ind
   &lt;SPAN class=""&gt;FROM&lt;/SPAN&gt; fraud_details_data_whole
   &lt;SPAN class=""&gt;GROUP&lt;/SPAN&gt; &lt;SPAN class=""&gt;BY&lt;/SPAN&gt; se10,
            se3,
            se_aggrtr_indctr ,
            key_swipe_ind) b
&lt;SPAN class=""&gt;WHERE&lt;/SPAN&gt; b.rn&lt;SPAN class=""&gt;&amp;lt;&lt;/SPAN&gt;&lt;SPAN class=""&gt;2&lt;/SPAN&gt;
&lt;/PRE&gt;&lt;P&gt;&lt;STRONG&gt;Output&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;PySpark&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;Code using&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;DataFrame&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;API :&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.sql &lt;SPAN class=""&gt;import&lt;/SPAN&gt; SparkSession
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.sql.window &lt;SPAN class=""&gt;import&lt;/SPAN&gt; Window
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.sql.functions &lt;SPAN class=""&gt;import&lt;/SPAN&gt; row_number

&lt;SPAN class=""&gt;# Create a SparkSession&lt;/SPAN&gt;
spark = SparkSession.builder \
    .appName(&lt;SPAN class=""&gt;"TranspileSQLExample"&lt;/SPAN&gt;) \
    .getOrCreate()

&lt;SPAN class=""&gt;# Sample data (replace with your actual DataFrame)&lt;/SPAN&gt;
data = [
    (&lt;SPAN class=""&gt;"se10_value1"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3_value1"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"aggrtr_value1"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"swipe_value1"&lt;/SPAN&gt;),
    (&lt;SPAN class=""&gt;"se10_value1"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3_value2"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"aggrtr_value2"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"swipe_value2"&lt;/SPAN&gt;),
    (&lt;SPAN class=""&gt;"se10_value2"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3_value3"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"aggrtr_value3"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"swipe_value3"&lt;/SPAN&gt;),
    (&lt;SPAN class=""&gt;"se10_value2"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3_value4"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"aggrtr_value4"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"swipe_value4"&lt;/SPAN&gt;),
    (&lt;SPAN class=""&gt;"se10_value3"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3_value5"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"aggrtr_value5"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"swipe_value5"&lt;/SPAN&gt;),
    (&lt;SPAN class=""&gt;"se10_value3"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3_value6"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"aggrtr_value6"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"swipe_value6"&lt;/SPAN&gt;)
]

&lt;SPAN class=""&gt;# Create DataFrame&lt;/SPAN&gt;
fraud_details_data_whole = spark.createDataFrame(data, [&lt;SPAN class=""&gt;"se10"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se_aggrtr_indctr"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"key_swipe_ind"&lt;/SPAN&gt;])

&lt;SPAN class=""&gt;# Define Window specification for row_number() function&lt;/SPAN&gt;
windowSpec = Window.partitionBy(&lt;SPAN class=""&gt;"se10"&lt;/SPAN&gt;).orderBy(fraud_details_data_whole[&lt;SPAN class=""&gt;"se_aggrtr_indctr"&lt;/SPAN&gt;].desc())

&lt;SPAN class=""&gt;# Add row number column&lt;/SPAN&gt;
fraud_details_data_whole = fraud_details_data_whole.withColumn(&lt;SPAN class=""&gt;"rn"&lt;/SPAN&gt;, row_number().over(windowSpec))

&lt;SPAN class=""&gt;# Filter rows where rn &amp;lt; 2&lt;/SPAN&gt;
result_df = fraud_details_data_whole.&lt;SPAN class=""&gt;filter&lt;/SPAN&gt;(&lt;SPAN class=""&gt;"rn &amp;lt; 2"&lt;/SPAN&gt;)

&lt;SPAN class=""&gt;# Select required columns&lt;/SPAN&gt;
result_df = result_df.select(&lt;SPAN class=""&gt;"se10"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se3"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"se_aggrtr_indctr"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"key_swipe_ind"&lt;/SPAN&gt;)

&lt;SPAN class=""&gt;# Show the result DataFrame&lt;/SPAN&gt;
result_df.show()&lt;/PRE&gt;&lt;P&gt;Is there any way to&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;translate&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;the above&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;Sql&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;Query&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;into its equivalent&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;PySpark DataFrame&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;API&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;code? The result must be equal when we execute&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;Sql&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;Script&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;and its transpiled&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;FONT color="#FF0000"&gt;PySpark&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;code&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;separately.&lt;/P&gt;&lt;P&gt;Kindly suggest&lt;/P&gt;&lt;P&gt;&lt;A href="https://stackoverflow.com/q/78446543/6842300" target="_blank" rel="nofollow noopener noreferrer"&gt;https://stackoverflow.com/q/78446543/6842300&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 08 May 2024 07:09:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/transpile-a-sql-script-into-pyspark-dataframe-api-equivalent/m-p/68532#M33989</guid>
      <dc:creator>thecodecache</dc:creator>
      <dc:date>2024-05-08T07:09:52Z</dc:date>
    </item>
    <item>
      <title>Re: Transpile a SQL Script into PySpark DataFrame API equivalent code</title>
      <link>https://community.databricks.com/t5/data-engineering/transpile-a-sql-script-into-pyspark-dataframe-api-equivalent/m-p/70108#M33992</link>
      <description>&lt;P&gt;&lt;SPAN class=""&gt;Hi &lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;,&lt;/SPAN&gt;&amp;nbsp;Thanks for your response.&amp;nbsp;&lt;BR /&gt;I'm looking for a utility or an automated way of translating any generic SQL into PySpark DataFrame code.&lt;/P&gt;&lt;P&gt;So, the translate function should look like below:&lt;BR /&gt;&lt;BR /&gt;def translate(input_sql):&lt;BR /&gt;&amp;nbsp; &amp;nbsp; # translate/convert it into pyspark dataframe code&lt;BR /&gt;&amp;nbsp; &amp;nbsp; return pyspark_code&lt;BR /&gt;&lt;BR /&gt;Please suggest&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 21 May 2024 12:25:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/transpile-a-sql-script-into-pyspark-dataframe-api-equivalent/m-p/70108#M33992</guid>
      <dc:creator>thecodecache</dc:creator>
      <dc:date>2024-05-21T12:25:56Z</dc:date>
    </item>
    <item>
      <title>Re: Transpile a SQL Script into PySpark DataFrame API equivalent code</title>
      <link>https://community.databricks.com/t5/data-engineering/transpile-a-sql-script-into-pyspark-dataframe-api-equivalent/m-p/106127#M42394</link>
      <description>&lt;P&gt;Hello&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/104747"&gt;@thecodecache&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;
&lt;P&gt;Have a look at the SQLGlot project:&amp;nbsp;&lt;A href="https://github.com/tobymao/sqlglot?tab=readme-ov-file#faq" target="_blank"&gt;https://github.com/tobymao/sqlglot?tab=readme-ov-file#faq&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;It can easily transpile SQL to Spark SQL, like this:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;import sqlglot
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("SQLGlot Example").getOrCreate()

# Original SQL query
sql_query = """
SELECT bar.a, b + 1 AS b
FROM bar
JOIN baz ON bar.a = baz.a
WHERE bar.a &amp;gt; 1
"""

# Convert SQL to Spark SQL dialect
spark_sql = sqlglot.transpile(sql_query, read="generic", write="spark")[0]

# Create a DataFrame from the Spark SQL query
df = spark.sql(spark_sql)

# Show the resulting DataFrame
df.show()
&lt;/LI-CODE&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 17 Jan 2025 18:17:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/transpile-a-sql-script-into-pyspark-dataframe-api-equivalent/m-p/106127#M42394</guid>
      <dc:creator>MathieuDB</dc:creator>
      <dc:date>2025-01-17T18:17:33Z</dc:date>
    </item>
  </channel>
</rss>

