<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Databricks Pyspark filter several columns with similar criteria in Get Started Discussions</title>
    <link>https://community.databricks.com/t5/get-started-discussions/databricks-pyspark-filter-several-columns-with-similar-criteria/m-p/93658#M8553</link>
    <description>&lt;P&gt;I am querying a table from the Databricks Catalog which I have to filter several columns with the same criteria.&amp;nbsp; Below is what I have created so far.&amp;nbsp; I have 10 columns that I have to filter with a set of criteria from (dx_list1) and another 10 that I have to filter with another set of criteria (dx_list2).&amp;nbsp;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I have started doing this: col("DX_00").isin(dx_list1) | col("DX_01").isin (dx_list2) and was planning to go all the way to DX_19.&amp;nbsp;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am wondering if there is a more efficient way to get the same results or is this as good as it gets.&amp;nbsp; Thank you&lt;/P&gt;&lt;P&gt;Code below:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;dx_list1 = ['J984', 'J466', 'J754', 'J64', 'J465', 'J445']&lt;/P&gt;&lt;P&gt;dx_list2 = ['J454','J445','J443','J76','J487','J765','J765']&lt;/P&gt;&lt;P&gt;test = spark.table("claim").select("ID","MEMB_KEY","PROV_KEY","CL#","DOS","DX_00","DX_01","DX_02","DX_03",&lt;BR /&gt;"DX_04","DX_05","DX_06","DX_07","DX_08","DX_09","DX_10")&lt;/P&gt;&lt;P&gt;.filter(&lt;BR /&gt;col("DX_00").isin(dx_list1) | col("DX_01").isin (dx_list2) &amp;amp;&lt;BR /&gt;col("DX_02").isin(dx_list1) | col("DX_03").isin (dx_list2) &amp;amp;&lt;BR /&gt;col("DX_04").isin(dx_list1) | col("DX_05").isin (dx_list2)&lt;BR /&gt;)&lt;/P&gt;&lt;P&gt;display(test)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Sat, 12 Oct 2024 01:04:38 GMT</pubDate>
    <dc:creator>abueno</dc:creator>
    <dc:date>2024-10-12T01:04:38Z</dc:date>
    <item>
      <title>Databricks Pyspark filter several columns with similar criteria</title>
      <link>https://community.databricks.com/t5/get-started-discussions/databricks-pyspark-filter-several-columns-with-similar-criteria/m-p/93658#M8553</link>
      <description>&lt;P&gt;I am querying a table from the Databricks Catalog which I have to filter several columns with the same criteria.&amp;nbsp; Below is what I have created so far.&amp;nbsp; I have 10 columns that I have to filter with a set of criteria from (dx_list1) and another 10 that I have to filter with another set of criteria (dx_list2).&amp;nbsp;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I have started doing this: col("DX_00").isin(dx_list1) | col("DX_01").isin (dx_list2) and was planning to go all the way to DX_19.&amp;nbsp;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am wondering if there is a more efficient way to get the same results or is this as good as it gets.&amp;nbsp; Thank you&lt;/P&gt;&lt;P&gt;Code below:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;dx_list1 = ['J984', 'J466', 'J754', 'J64', 'J465', 'J445']&lt;/P&gt;&lt;P&gt;dx_list2 = ['J454','J445','J443','J76','J487','J765','J765']&lt;/P&gt;&lt;P&gt;test = spark.table("claim").select("ID","MEMB_KEY","PROV_KEY","CL#","DOS","DX_00","DX_01","DX_02","DX_03",&lt;BR /&gt;"DX_04","DX_05","DX_06","DX_07","DX_08","DX_09","DX_10")&lt;/P&gt;&lt;P&gt;.filter(&lt;BR /&gt;col("DX_00").isin(dx_list1) | col("DX_01").isin (dx_list2) &amp;amp;&lt;BR /&gt;col("DX_02").isin(dx_list1) | col("DX_03").isin (dx_list2) &amp;amp;&lt;BR /&gt;col("DX_04").isin(dx_list1) | col("DX_05").isin (dx_list2)&lt;BR /&gt;)&lt;/P&gt;&lt;P&gt;display(test)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sat, 12 Oct 2024 01:04:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/databricks-pyspark-filter-several-columns-with-similar-criteria/m-p/93658#M8553</guid>
      <dc:creator>abueno</dc:creator>
      <dc:date>2024-10-12T01:04:38Z</dc:date>
    </item>
    <item>
      <title>Re: Databricks Pyspark filter several columns with similar criteria</title>
      <link>https://community.databricks.com/t5/get-started-discussions/databricks-pyspark-filter-several-columns-with-similar-criteria/m-p/93694#M8554</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/114593"&gt;@abueno&lt;/a&gt;&amp;nbsp;,&lt;BR /&gt;&lt;BR /&gt;As I understand the logic you want to implement is:&lt;BR /&gt;1. For every pair of columns:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;First Column (DX_i): Must be in dx_list1&lt;/LI&gt;&lt;LI&gt;Second Column (DX_{i+1}): Must be in dx_list2&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;2.&amp;nbsp;The condition for each pair is:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;col('DX_i').isin(dx_list1) OR col('DX_{i+1}').isin(dx_list2)&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;3.&amp;nbsp;All pairwise conditions are combined using AND&lt;/P&gt;&lt;P&gt;In other words, you want to filter rows where for each pair of columns, at least one of the columns satisfies the corresponding criteria, and all pairs must satisfy this condition.&lt;BR /&gt;&lt;BR /&gt;If the logic is different, please clarify. The below example is based on this logic.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Sample data with records designed to be filtered out
data = [
    # ID, MEMB_KEY, PROV_KEY, CL#, DOS, DX_00 to DX_11
    # Records that should be included
    (1, 'M001', 'P001', 'CL001', '2022-01-01',
     'J984', 'A123', 'J984', 'A123', 'J984', 'A123',
     'J984', 'A123', 'J984', 'A123', 'J984', 'A123'),
    (2, 'M002', 'P002', 'CL002', '2022-01-02',
     'A123', 'J445', 'A123', 'J445', 'A123', 'J445',
     'A123', 'J445', 'A123', 'J445', 'A123', 'J445'),
    (3, 'M003', 'P003', 'CL003', '2022-01-03',
     'J984', 'J454', 'J984', 'J454', 'J984', 'J454',
     'J984', 'J454', 'J984', 'J454', 'J984', 'J454'),
    (4, 'M004', 'P004', 'CL004', '2022-01-04',
     'J465', 'J76', 'J465', 'J76', 'J465', 'J76',
     'J465', 'J76', 'J465', 'J76', 'J465', 'J76'),
    (5, 'M005', 'P005', 'CL005', '2022-01-05',
     'J754', 'J765', 'J754', 'J765', 'J754', 'J765',
     'J754', 'J765', 'J754', 'J765', 'J754', 'J765'),
    # Records that should be excluded
    (6, 'M006', 'P006', 'CL006', '2022-01-06',
     'A123', 'A123', 'A123', 'A123', 'A123', 'A123',
     'A123', 'A123', 'A123', 'A123', 'A123', 'A123'),
    (7, 'M007', 'P007', 'CL007', '2022-01-07',
     'J64', 'J487', 'A123', 'A123', 'A123', 'A123',
     'A123', 'A123', 'A123', 'A123', 'A123', 'A123'),
    (8, 'M008', 'P008', 'CL008', '2022-01-08',
     'A123', 'A123', 'J64', 'A123', 'A123', 'A123',
     'A123', 'A123', 'A123', 'A123', 'A123', 'A123'),
    (9, 'M009', 'P009', 'CL009', '2022-01-09',
     'J466', 'J443', 'J466', 'A123', 'A123', 'J443',
     'A123', 'A123', 'A123', 'A123', 'A123', 'A123'),
    (10, 'M010', 'P010', 'CL010', '2022-01-10',
     'J445', 'A123', 'A123', 'J445', 'A123', 'A123',
     'J445', 'A123', 'A123', 'A123', 'J445', 'A123'),
]

columns = ["ID", "MEMB_KEY", "PROV_KEY", "CL#", "DOS"] + \
    [f'DX_{str(i).zfill(2)}' for i in range(0, 12)]

df = spark.createDataFrame(data, columns)&lt;/LI-CODE&gt;&lt;LI-CODE lang="python"&gt;# Filtering lists
dx_list1 = ['J984', 'J466', 'J754', 'J64', 'J465', 'J445']
dx_list2 = ['J454', 'J445', 'J443', 'J76', 'J487', 'J765']

# Generate DX column names
dx_columns = [f'DX_{str(i).zfill(2)}' for i in range(0, 12)]  # DX_00 to DX_11

# Create pairs of columns: (DX_00, DX_01), (DX_02, DX_03), ..., (DX_10, DX_11)
dx_pairs = [(dx_columns[i], dx_columns[i+1]) for i in range(0, len(dx_columns)-1, 2)]&lt;/LI-CODE&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql.functions import col
from functools import reduce
import operator

# Build list of conditions
pair_conditions = [
    (col(dx_i).isin(dx_list1) | col(dx_j).isin(dx_list2))
    for dx_i, dx_j in dx_pairs
]

# Combine conditions with AND
final_condition = reduce(operator.and_, pair_conditions)&lt;/LI-CODE&gt;&lt;LI-CODE lang="python"&gt;filtered_df = df.filter(final_condition)

# Show the results
filtered_df.select("ID", *dx_columns).show(truncate=False)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sun, 13 Oct 2024 10:19:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/databricks-pyspark-filter-several-columns-with-similar-criteria/m-p/93694#M8554</guid>
      <dc:creator>filipniziol</dc:creator>
      <dc:date>2024-10-13T10:19:46Z</dc:date>
    </item>
  </channel>
</rss>

