<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Upset Plot in Databricks in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/upset-plot-in-databricks/m-p/94989#M39030</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/128036"&gt;@PJ11&lt;/a&gt;&amp;nbsp;,&lt;BR /&gt;&lt;BR /&gt;&lt;A href="https://upsetplot.readthedocs.io/en/stable/formats.html" target="_self"&gt;As per documentation&lt;/A&gt;:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;UpSetPlot internally works with data based&lt;STRONG&gt; on Pandas data structures&lt;/STRONG&gt;: a Series when all you care about is counts, or a DataFrame when you’re interested in visualising additional properties of the data, such as with the UpSet.add_catplot method.&lt;/EM&gt;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;UpSetPlot expects the Series or DataFrame to have a MultiIndex as input, with this index being an indicator matrix. Specifically, each category is a level in the pandas.MultiIndex with boolean values.&lt;BR /&gt;&lt;BR /&gt;&lt;/EM&gt;Make sure to convert your your spark dataframes to pandas dataframes.&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;df_pandas = df.toPandas()&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;Please check below end-to-end example, that includes:&lt;/P&gt;&lt;P&gt;1. Initial data preparation (to simulate some data you can read from a table)&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Import necessary libraries
import pandas as pd
from upsetplot import UpSet
import matplotlib.pyplot as plt

from pyspark.sql.functions import explode, array, lit, col
from pyspark.sql.types import BooleanType

# Simulate data for the Spark DataFrame
data = [
    (1, False, False, False, False),
    (2, False, True, False, False),
    (3, False, True, True, False),
    (4, False, False, True, False),
    (5, True, False, False, False),
    (6, True, True, False, False),
    (7, False, False, False, True),
    (8, True, False, True, False),
    (9, True, True, True, False),
    (10, False, True, False, True),
    (11, False, False, True, True),
    (12, False, True, True, True),
    (13, True, False, False, True),
    (14, True, True, False, True),
    (15, True, True, True, True),
    (16, True, False, True, True),
]

counts = [169, 159, 110, 108, 84, 69, 49, 46, 43, 35, 32, 31, 22, 20, 14, 9]

# Create a Spark DataFrame
columns = ['member_id', 'Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']
df_spark = spark.createDataFrame([dict(zip(columns, row)) for row in data])

# Create a list of counts as an array column
df_with_counts = df_spark.withColumn('count', lit(0))  # Placeholder column

for i, count in enumerate(counts):
    df_with_counts = df_with_counts.union(
        df_spark.filter(col('member_id') == data[i][0])
        .withColumn('count', array([lit(1)] * count))
        .withColumn('count', explode(col('count')))
    )

# Remove the placeholder rows
df_with_counts = df_with_counts.filter(col('count') != 0)

# Drop 'member_id' column
df_transformed = df_with_counts.drop('member_id', 'count')

for col_name in ['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']:
    df_transformed = df_transformed.withColumn(col_name, col(col_name).cast(BooleanType()))

display(df_transformed)&lt;/LI-CODE&gt;&lt;P&gt;&lt;SPAN&gt;2. Conversions to pandas objects and then creation of the upset plot&lt;/SPAN&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Convert the Spark DataFrame to a pandas DataFrame
df_pandas = df_transformed.toPandas()

# Group by all feature columns and count the occurrences
df_counts = df_pandas.groupby(['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']).size().reset_index(name='count')

# Create a MultiIndex from the feature columns
index = pd.MultiIndex.from_frame(df_counts[['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']])

# Create a Series with counts, using the MultiIndex
data_counts = pd.Series(df_counts['count'].values, index=index)

# Display the Series (optional)
print("Series with MultiIndex:")
display(data_counts)

upset = UpSet(
    data_counts,
    subset_size='auto',          # Use the counts provided in the Series
    show_counts='%d',            # Display counts on top of bars
    sort_by='cardinality',       # Sort subsets by counts
    sort_categories_by='-input',
    facecolor='blue',            # Set bar color to blue
)

# Generate the plot
fig = plt.figure(figsize=(12, 8))
upset.plot(fig=fig)

# Display the plot
plt.show()&lt;/LI-CODE&gt;&lt;P&gt;Final output:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="filipniziol_0-1729325359814.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/12083i0375D4ADB064E7A0/image-size/medium?v=v2&amp;amp;px=400" role="button" title="filipniziol_0-1729325359814.png" alt="filipniziol_0-1729325359814.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Sat, 19 Oct 2024 08:09:32 GMT</pubDate>
    <dc:creator>filipniziol</dc:creator>
    <dc:date>2024-10-19T08:09:32Z</dc:date>
    <item>
      <title>Upset Plot in Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/upset-plot-in-databricks/m-p/94983#M39028</link>
      <description>&lt;P&gt;I am trying to create an Upset Plot using following code, but my output is not as expected. See Image1: Output which I am getting vs Image2: Output expected. Where the total count of overlap is displayed at the top of each bar, Bar size is proportionate with the count. it's in descending order. Please help me with this. Thank you.&lt;/P&gt;&lt;P&gt;The code is&amp;nbsp;&lt;/P&gt;&lt;P&gt;import pandas as pd&lt;/P&gt;&lt;P&gt;pip install upsetplot&lt;/P&gt;&lt;P&gt;from upsetplot import UpSet&lt;/P&gt;&lt;P&gt;import matplotlib.pyplot as plt&lt;/P&gt;&lt;P&gt;df_spark=spark.table("hive_metastore.sndbx_cx.q3_data")&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;df_spark.show(5)&lt;/P&gt;&lt;P&gt;df=df_spark.toPandas()&lt;/P&gt;&lt;P&gt;df=df.drop(columns=['member_id'])&lt;/P&gt;&lt;P&gt;df_counts=df_multi.value_counts()&lt;/P&gt;&lt;P&gt;df=df.astype(bool)&lt;/P&gt;&lt;P&gt;from upsetplot import UpSet&lt;/P&gt;&lt;P&gt;df_multiindex=df.groupby(list(df.columns)).size()&lt;/P&gt;&lt;P&gt;fig=plt.figure(figsize=(15,10),dpi=150)&lt;/P&gt;&lt;P&gt;upset=UpSet(df_multiindex,subset_size='count',show_counts=True,min_subset_size=1,max_subset_size=15)&lt;BR /&gt;upset.plot()&lt;/P&gt;&lt;P&gt;plt.show()&lt;/P&gt;</description>
      <pubDate>Sat, 19 Oct 2024 06:08:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/upset-plot-in-databricks/m-p/94983#M39028</guid>
      <dc:creator>PJ11</dc:creator>
      <dc:date>2024-10-19T06:08:25Z</dc:date>
    </item>
    <item>
      <title>Re: Upset Plot in Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/upset-plot-in-databricks/m-p/94989#M39030</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/128036"&gt;@PJ11&lt;/a&gt;&amp;nbsp;,&lt;BR /&gt;&lt;BR /&gt;&lt;A href="https://upsetplot.readthedocs.io/en/stable/formats.html" target="_self"&gt;As per documentation&lt;/A&gt;:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;UpSetPlot internally works with data based&lt;STRONG&gt; on Pandas data structures&lt;/STRONG&gt;: a Series when all you care about is counts, or a DataFrame when you’re interested in visualising additional properties of the data, such as with the UpSet.add_catplot method.&lt;/EM&gt;&lt;/P&gt;&lt;P&gt;&lt;EM&gt;UpSetPlot expects the Series or DataFrame to have a MultiIndex as input, with this index being an indicator matrix. Specifically, each category is a level in the pandas.MultiIndex with boolean values.&lt;BR /&gt;&lt;BR /&gt;&lt;/EM&gt;Make sure to convert your your spark dataframes to pandas dataframes.&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;df_pandas = df.toPandas()&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;Please check below end-to-end example, that includes:&lt;/P&gt;&lt;P&gt;1. Initial data preparation (to simulate some data you can read from a table)&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Import necessary libraries
import pandas as pd
from upsetplot import UpSet
import matplotlib.pyplot as plt

from pyspark.sql.functions import explode, array, lit, col
from pyspark.sql.types import BooleanType

# Simulate data for the Spark DataFrame
data = [
    (1, False, False, False, False),
    (2, False, True, False, False),
    (3, False, True, True, False),
    (4, False, False, True, False),
    (5, True, False, False, False),
    (6, True, True, False, False),
    (7, False, False, False, True),
    (8, True, False, True, False),
    (9, True, True, True, False),
    (10, False, True, False, True),
    (11, False, False, True, True),
    (12, False, True, True, True),
    (13, True, False, False, True),
    (14, True, True, False, True),
    (15, True, True, True, True),
    (16, True, False, True, True),
]

counts = [169, 159, 110, 108, 84, 69, 49, 46, 43, 35, 32, 31, 22, 20, 14, 9]

# Create a Spark DataFrame
columns = ['member_id', 'Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']
df_spark = spark.createDataFrame([dict(zip(columns, row)) for row in data])

# Create a list of counts as an array column
df_with_counts = df_spark.withColumn('count', lit(0))  # Placeholder column

for i, count in enumerate(counts):
    df_with_counts = df_with_counts.union(
        df_spark.filter(col('member_id') == data[i][0])
        .withColumn('count', array([lit(1)] * count))
        .withColumn('count', explode(col('count')))
    )

# Remove the placeholder rows
df_with_counts = df_with_counts.filter(col('count') != 0)

# Drop 'member_id' column
df_transformed = df_with_counts.drop('member_id', 'count')

for col_name in ['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']:
    df_transformed = df_transformed.withColumn(col_name, col(col_name).cast(BooleanType()))

display(df_transformed)&lt;/LI-CODE&gt;&lt;P&gt;&lt;SPAN&gt;2. Conversions to pandas objects and then creation of the upset plot&lt;/SPAN&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Convert the Spark DataFrame to a pandas DataFrame
df_pandas = df_transformed.toPandas()

# Group by all feature columns and count the occurrences
df_counts = df_pandas.groupby(['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']).size().reset_index(name='count')

# Create a MultiIndex from the feature columns
index = pd.MultiIndex.from_frame(df_counts[['Feature_A', 'Feature_B', 'Feature_C', 'Feature_D']])

# Create a Series with counts, using the MultiIndex
data_counts = pd.Series(df_counts['count'].values, index=index)

# Display the Series (optional)
print("Series with MultiIndex:")
display(data_counts)

upset = UpSet(
    data_counts,
    subset_size='auto',          # Use the counts provided in the Series
    show_counts='%d',            # Display counts on top of bars
    sort_by='cardinality',       # Sort subsets by counts
    sort_categories_by='-input',
    facecolor='blue',            # Set bar color to blue
)

# Generate the plot
fig = plt.figure(figsize=(12, 8))
upset.plot(fig=fig)

# Display the plot
plt.show()&lt;/LI-CODE&gt;&lt;P&gt;Final output:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="filipniziol_0-1729325359814.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/12083i0375D4ADB064E7A0/image-size/medium?v=v2&amp;amp;px=400" role="button" title="filipniziol_0-1729325359814.png" alt="filipniziol_0-1729325359814.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sat, 19 Oct 2024 08:09:32 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/upset-plot-in-databricks/m-p/94989#M39030</guid>
      <dc:creator>filipniziol</dc:creator>
      <dc:date>2024-10-19T08:09:32Z</dc:date>
    </item>
  </channel>
</rss>

