<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Trying to build subquery in Databricks notebook, similar to SQL in a data frame with the Top(1) in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/trying-to-build-subquery-in-databricks-notebook-similar-to-sql/m-p/5017#M1553</link>
    <description>&lt;P&gt;The TOP keyword is not supported, so instead you can order your data in descending order and then use LIMIT 1 at the end; it will return only the most recent record, and you can modify your query accordingly.&lt;/P&gt;</description>
    <pubDate>Mon, 01 May 2023 10:42:53 GMT</pubDate>
    <dc:creator>Rishabh-Pandey</dc:creator>
    <dc:date>2023-05-01T10:42:53Z</dc:date>
    <item>
      <title>Trying to build subquery in Databricks notebook, similar to SQL in a data frame with the Top(1)</title>
      <link>https://community.databricks.com/t5/data-engineering/trying-to-build-subquery-in-databricks-notebook-similar-to-sql/m-p/5016#M1552</link>
      <description>&lt;P&gt;Hello Everyone,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I am new to Databricks, so I am at the learning stage.&amp;nbsp;&lt;/P&gt;&lt;P&gt;It would be very helpful if someone helps in resolving the issue or I can say helped me to fix my code.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I have built the query that fetches the data based on CASE, in Case I have a used sub-query of the same table that returns the data based on TOP(1), order by date desc.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I tried to do it using Pyspark and after a lot of R&amp;amp;D I found that order by and select top 1 is not used in Pyspark, &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Please provide me some solution for this. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Original Code in SQL:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;SELECT distinct
		@CompanyId,
		curr.t_ccur,
		@TargetCurrency,
		curr.t_rtyp,
		curr.t_stdt,
		CASE WHEN @Basecurrency = @TargetCurrency THEN 1
			 WHEN @Basecurrency &amp;lt;&amp;gt; @ReferenceCurrency and @TargetCurrency &amp;lt;&amp;gt; @ReferenceCurrency THEN
				(SELECT TOP(1) (CASE WHEN it2.t_excb = 2 THEN (select top(1) it.t_rate from dbo.currenciestable it where it.t_bcur = @ReferenceCurrency and it.t_ccur = @Basecurrency and it.t_stdt &amp;lt;= curr.t_stdt and it.t_rtyp = curr.t_rtyp order by it.t_stdt desc) /it2.t_rate 
								ELSE it2.t_rate * (select top(1) it.t_rate from dbo.currenciestable it where it.t_bcur = @ReferenceCurrency and it.t_ccur = @Basecurrency and it.t_stdt &amp;lt;= curr.t_stdt and it.t_rtyp = curr.t_rtyp order by it.t_stdt desc) END) From ttcmcs008100 it2 where it2.t_bcur = @ReferenceCurrency and it2.t_ccur = @TargetCurrency and it2.t_stdt &amp;lt;= curr.t_stdt and it2.t_rtyp = curr.t_rtyp order by it2.t_stdt desc)
				
		        ELSE 0 END
	from dbo.currenciestable curr
	where curr.t_bcur = @ReferenceCurrency and curr.t_ccur = @Basecurrency&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Tried to replicate the above code in Pyspark, it is executing but giving the wrong data&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql.functions import current_timestamp
&amp;nbsp;
CompanyId = 100
Basecurrency = 'USD'
TargetCurrency = 'SAR'
ReferenceCurrency = 'AED'
&amp;nbsp;
w = spark.sql(f"""
SELECT Distinct
'{CompanyId}' AS CompanyId,
curr.t_ccur AS BaseCurrency,
'{TargetCurrency}' AS TargetCurrency,
curr.t_rtyp AS RateType,
curr.t_stdt AS EffectiveDate,
    CASE WHEN ('{Basecurrency}' == '{TargetCurrency}') THEN 
        WHEN ('{Basecurrency}' &amp;lt;&amp;gt; '{ReferenceCurrency}' and '{TargetCurrency}' &amp;lt;&amp;gt; '{ReferenceCurrency}') THEN 
        (SELECT first(CASE WHEN it2.t_excb = 2 THEN (select first(it1.t_rate) from currenciestable it1 inner join currenciestablecurr on it1.t_stdt &amp;lt;= curr.t_stdt and it1.t_rtyp = curr.t_rtyp and it1.t_bcur = '{ReferenceCurrency}' and it1.t_ccur = '{Basecurrency}' ) /it2.t_rate ELSE it2.t_rate * (select first(it.t_rate) from currenciestable it inner join currenciestable curr on it.t_bcur = '{ReferenceCurrency}' and it.t_ccur = '{Basecurrency}' and it.t_stdt &amp;lt;= curr.t_stdt and it.t_rtyp = curr.t_rtyp ) END) From currenciestable it2 inner join currenciestable curr ON it2.t_bcur = '{ReferenceCurrency}' and it2.t_ccur = '{TargetCurrency}' and it2.t_stdt &amp;lt;= curr.t_stdt and it2.t_rtyp = curr.t_rtyp )															
																
							     
    ELSE 0 END AS Rate
from currenciestable curr
where curr.t_bcur = '{ReferenceCurrency}' and curr.t_ccur = '{Basecurrency}'
    """)
display(w)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Moreover, I tried to replicate the same using Dataframe, &lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql.functions import when, col
&amp;nbsp;
df_empty = df_empty.withColumn(
    "Rate",
    when(
        (df_empty["Basecurrency"] == df_empty["TargetCurrency"]),
        1
    )
    .when(
        (
            (df_empty["Basecurrency"] != df_empty["ReferenceCurrency"]) &amp;amp; 
            (df_empty["TargetCurrency"] != df_empty["ReferenceCurrency"])
        ),
        when(
            df_empty["ExpressionBaseCurrency"] == 2,
            (
                df_curr.select("t_rate")
                .where(
                    (df_curr["t_bcur"] == col("t_bcur")) &amp;amp;
                    (df_curr["t_ccur"] == col("t_ccur")) &amp;amp;
                    (df_curr["t_stdt"] &amp;lt;= col("t_stdt")) &amp;amp;
                    (df_curr["t_rtyp"] == col("t_rtyp"))
                )
                .orderBy(df_curr["t_stdt"].desc())
                .limit(1)
                .selectExpr("t_rate / t_rate as rate")
            )
            .otherwise(
                df_curr.select("t_rate")
                .where(
                    (col("t_bcur") == col("t_bcur")) &amp;amp;
                    (col("t_ccur") == col("t_ccur")) &amp;amp;
                    (col("t_stdt") &amp;lt;= col("t_stdt")) &amp;amp;
                    (col("t_rtyp") == col("t_rtyp"))
                )
                .orderBy(col("t_stdt").desc())
                .limit(1)
                .selectExpr("t_rate * t_rate as rate")
            )
        )
    )
    .otherwise(df_empty["Rate"])
)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Kindly help &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks in advance&lt;/P&gt;</description>
      <pubDate>Sun, 30 Apr 2023 05:15:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/trying-to-build-subquery-in-databricks-notebook-similar-to-sql/m-p/5016#M1552</guid>
      <dc:creator>DeviJaviya</dc:creator>
      <dc:date>2023-04-30T05:15:56Z</dc:date>
    </item>
    <item>
      <title>Re: Trying to build subquery in Databricks notebook, similar to SQL in a data frame with the Top(1)</title>
      <link>https://community.databricks.com/t5/data-engineering/trying-to-build-subquery-in-databricks-notebook-similar-to-sql/m-p/5017#M1553</link>
      <description>&lt;P&gt;The TOP keyword is not supported, so instead you can order your data in descending order and then use LIMIT 1 at the end; it will return only the most recent record, and you can modify your query accordingly.&lt;/P&gt;</description>
      <pubDate>Mon, 01 May 2023 10:42:53 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/trying-to-build-subquery-in-databricks-notebook-similar-to-sql/m-p/5017#M1553</guid>
      <dc:creator>Rishabh-Pandey</dc:creator>
      <dc:date>2023-05-01T10:42:53Z</dc:date>
    </item>
    <item>
      <title>Re: Trying to build subquery in Databricks notebook, similar to SQL in a data frame with the Top(1)</title>
      <link>https://community.databricks.com/t5/data-engineering/trying-to-build-subquery-in-databricks-notebook-similar-to-sql/m-p/5018#M1554</link>
      <description>&lt;P&gt;Hello Rishabh,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you for your suggestion. We tried LIMIT 1, but the output values come out the same for all the dates, which is not correct.&lt;/P&gt;</description>
      <pubDate>Thu, 04 May 2023 03:40:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/trying-to-build-subquery-in-databricks-notebook-similar-to-sql/m-p/5018#M1554</guid>
      <dc:creator>DeviJaviya</dc:creator>
      <dc:date>2023-05-04T03:40:16Z</dc:date>
    </item>
  </channel>
</rss>

