<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: EXPLAIN PLAN parser in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129380#M48515</link>
    <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/179612"&gt;@WiliamRosa&lt;/a&gt;&amp;nbsp;ah thanks much.&lt;/P&gt;&lt;P&gt;Just a final question - it seems the code executes the SQL query &amp;amp; then captures the columns from the plan? If so, my need is to go through around 1000 sqls each day &amp;amp; get the columns used in filters/joins/group by of these. So, executing them is not an option&lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
    <pubDate>Sat, 23 Aug 2025 01:39:17 GMT</pubDate>
    <dc:creator>noorbasha534</dc:creator>
    <dc:date>2025-08-23T01:39:17Z</dc:date>
    <item>
      <title>EXPLAIN PLAN parser</title>
      <link>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129368#M48509</link>
      <description>&lt;P&gt;hello all,&lt;/P&gt;&lt;P&gt;have you come across a freely available parser that does good job for parsing explain plans for SQL queries in databricks...&lt;/P&gt;</description>
      <pubDate>Fri, 22 Aug 2025 22:34:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129368#M48509</guid>
      <dc:creator>noorbasha534</dc:creator>
      <dc:date>2025-08-22T22:34:54Z</dc:date>
    </item>
    <item>
      <title>Re: EXPLAIN PLAN parser</title>
      <link>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129371#M48510</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/124839"&gt;@noorbasha534&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;See if this helps: you can run an EXPLAIN directly in the SQL editor. Try something like:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;EXPLAIN FORMATTED
SELECT c.id, SUM(s.val) v
FROM sales s JOIN customers c ON s.cid = c.id
WHERE s.dt &amp;gt;= '2025-08-01'
GROUP BY c.id;&lt;/LI-CODE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="WiliamRosa_0-1755906421026.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/19284i3812CE6A75608513/image-size/medium?v=v2&amp;amp;px=400" role="button" title="WiliamRosa_0-1755906421026.png" alt="WiliamRosa_0-1755906421026.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;You can also view details via the “See performance” icon that appears after the query runs.&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="WiliamRosa_1-1755906693198.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/19285i13BEFF16F50B97F2/image-size/medium?v=v2&amp;amp;px=400" role="button" title="WiliamRosa_1-1755906693198.png" alt="WiliamRosa_1-1755906693198.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;Hope that helps!&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 22 Aug 2025 23:53:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129371#M48510</guid>
      <dc:creator>WiliamRosa</dc:creator>
      <dc:date>2025-08-22T23:53:54Z</dc:date>
    </item>
    <item>
      <title>Re: EXPLAIN PLAN parser</title>
      <link>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129378#M48513</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/179612"&gt;@WiliamRosa&lt;/a&gt;&amp;nbsp;thanks. What I meant is a parser that gives me filter columns, join columns from the explain plan&lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 23 Aug 2025 01:20:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129378#M48513</guid>
      <dc:creator>noorbasha534</dc:creator>
      <dc:date>2025-08-23T01:20:01Z</dc:date>
    </item>
    <item>
      <title>Re: EXPLAIN PLAN parser</title>
      <link>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129379#M48514</link>
      <description>&lt;P&gt;&lt;EM&gt;See if this helps:&lt;/EM&gt;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;&lt;BR /&gt;Import org.apache.spark.sql.DataFrame&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;import org.apache.spark.sql.catalyst.plans.logical._&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;import org.apache.spark.sql.catalyst.expressions._&lt;/EM&gt;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;def extractFiltersAndJoins(df: DataFrame): (Set[String], Set[(String,String)]) = {&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;val plan = df.queryExecution.optimizedPlan&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;var filterCols = Set.empty[String]&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;var joinCols = Set.empty[(String,String)]&lt;/EM&gt;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;plan.foreach {&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;case Filter(condition, _) =&amp;gt;&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;condition.references.foreach(ref =&amp;gt; filterCols += ref.sql)&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;case Join(left, right, _, condOpt, _) =&amp;gt;&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;condOpt.foreach { cond =&amp;gt;&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;cond.references.toSeq.combinations(2).foreach {&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;case Seq(a,b) =&amp;gt; joinCols += (a.sql -&amp;gt; b.sql)&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;case _ =&amp;gt;&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;}&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;}&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;case _ =&amp;gt;&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;}&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;(filterCols, joinCols)&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;}&lt;/EM&gt;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;// use it&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;val df = spark.sql("""&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;SELECT c.id, SUM(s.val) v&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;FROM sales s JOIN customers c ON s.cid = c.id&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;WHERE s.dt &amp;gt;= DATE '2025-08-01'&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;GROUP BY c.id&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;""")&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;val (filters, joins) = extractFiltersAndJoins(df)&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;println("Filter columns: " + filters)&lt;/EM&gt;&lt;BR /&gt;println("Join columns: " + joins)&lt;/P&gt;</description>
      <pubDate>Sat, 23 Aug 2025 01:33:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129379#M48514</guid>
      <dc:creator>WiliamRosa</dc:creator>
      <dc:date>2025-08-23T01:33:03Z</dc:date>
    </item>
    <item>
      <title>Re: EXPLAIN PLAN parser</title>
      <link>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129380#M48515</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/179612"&gt;@WiliamRosa&lt;/a&gt;&amp;nbsp;ah thanks much.&lt;/P&gt;&lt;P&gt;Just a final question - it seems the code executes the SQL query &amp;amp; then captures the columns from the plan? If so, my need is to go through around 1000 sqls each day &amp;amp; get the columns used in filters/joins/group by of these. So, executing them is not an option&lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 23 Aug 2025 01:39:17 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129380#M48515</guid>
      <dc:creator>noorbasha534</dc:creator>
      <dc:date>2025-08-23T01:39:17Z</dc:date>
    </item>
    <item>
      <title>Re: EXPLAIN PLAN parser</title>
      <link>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129381#M48516</link>
      <description>&lt;P&gt;&lt;SPAN&gt;got it. If execution is not an option, you can still extract columns without running the queries by parsing their AST.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Use Spark’s internal SQL parser (no execution)&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;You can parse to an unresolved logical plan and walk the tree for Filter, Join, and Aggregate nodes. This avoids query execution and catalog reads on data (though the analyzer may still look up objects if you resolve). In Scala:&lt;/SPAN&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.expressions._

val spark: SparkSession = SparkSession.getActiveSession.get
val parser = spark.sessionState.sqlParser // parse only, no execution

def colsFromExpr(e: Expression): Set[String] =
e.references.map(_.sql).toSet

case class Parsed(colsFilter: Set[String], colsJoin: Set[(String,String)], colsGroupBy: Set[String])

def scan(sql: String): Parsed = {
val plan = parser.parsePlan(sql) // Unresolved plan (no execution)
var filterCols = Set.empty[String]
var joinCols = Set.empty[(String,String)]
var groupCols = Set.empty[String]

plan.foreach {
case f: Filter =&amp;gt;
filterCols ++= colsFromExpr(f.condition)

case j: Join =&amp;gt;
j.condition.foreach { cond =&amp;gt;
val refs = cond.references.toSeq.map(_.sql).distinct
// pairwise for simple equi-joins
for {
i &amp;lt;- refs.indices
j &amp;lt;- (i+1) until refs.length
} joinCols += (refs(i) -&amp;gt; refs(j))
}

case a: Aggregate =&amp;gt;
groupCols ++= a.groupingExpressions.flatMap(e =&amp;gt; colsFromExpr(e))

case _ =&amp;gt;
}
Parsed(filterCols, joinCols, groupCols)
}

// Example
val sql =
"""SELECT c.id, SUM(s.val)
FROM sales s JOIN customers c ON s.cid = c.id
WHERE s.dt &amp;gt;= DATE '2025-08-01'
GROUP BY c.id"""

val out = scan(sql)
println(s"Filter: ${out.colsFilter}")
println(s"Joins: ${out.colsJoin}")
println(s"Group By: ${out.colsGroupBy}")&lt;/LI-CODE&gt;</description>
      <pubDate>Sat, 23 Aug 2025 01:50:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/explain-plan-parser/m-p/129381#M48516</guid>
      <dc:creator>WiliamRosa</dc:creator>
      <dc:date>2025-08-23T01:50:04Z</dc:date>
    </item>
  </channel>
</rss>

