data = [['x', 20220118, 'FALSE', 3], ['x', 20220118, 'TRUE', 97],
        ['x', 20220119, 'FALSE', 1], ['x', 20220119, 'TRUE', 49],
        ['Y', 20220118, 'FALSE', 100], ['Y', 20220118, 'TRUE', 900],
        ['Y', 20220119, 'FALSE', 200], ['Y', 20220119, 'TRUE', 800]]
df = spark.createDataFrame(data, ['source', 'date', 'formattedMessage', 'messageCount'])
df.show()
df.createOrReplaceTempView("source_data")
spark.sql("with count_data as (select source, date, sum(messageCount) as total, count(date) from source_data group By source, date), main_data as (select * from source_data where formattedMessage='FALSE')select cd.source, cd.date, (messageCount/total)*100||'0%' as badMessages from count_data cd inner join main_data md on cd.source=md.source and cd.date=md.date").show()
**Note:** the '0%' suffix is just a quick default for formatting the percentage; the round function can be used instead to control the decimal places.
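Concretely, the percentage expression in the query above could be written with an explicit round (two decimal places is an arbitrary choice here):

ROUND((messageCount / total) * 100, 2) || '%' AS badMessages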