Hi,
I have created a python wheel with the following code. And the package name is rule_engine
"""
The entry point of the Python Wheel
"""
import sys
from pyspark.sql.functions import expr, col
def get_rules(tag):
"""
loads data quality rules from a table
:param tag: tag to match
:return: dictionary of rules that matched the tag
"""
rules = {}
df = spark.read.table("rules")
for row in df.filter(col("tag") == tag).collect():
rules[row['name']] = row['constraint']
return rules
def get_quarantine_rules(tag):
"""
loads data quality rules from a table
:param tag: tag to match
:return: dictionary of rules that matched the tag
"""
all_rules_in_tags=get_rules(tag)
qurantine_rule="NOT({0})".format(" AND ".join(all_rules_in_tags.values()))
return qurantine_rule
Now after I install it into Databricks Cluster and then import it so I can call the function defined into it.
import rule_engine
rule_dict=rule_engine.get_quarantine_rules("maintained")
It throws this error:
NameError Traceback (most recent call last)
<command-502204870200978> in <cell line: 2>()
1 import rule_engine
----> 2 rule_dict=rule_engine.get_quarantine_rules("maintained")
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/rule_engine/functions.py in get_quarantine_rules(tag)
27 :return: dictionary of rules that matched the tag
28 """
---> 29 all_rules_in_tags=get_rules(tag)
30 qurantine_rule="NOT({0})".format(" AND ".join(all_rules_in_tags.values()))
31 return qurantine_rule
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/rule_engine/functions.py in get_rules(tag)
15 """
16 rules = {}
---> 17 df = spark.read.table("rules")
18 for row in df.filter(col("tag") == tag).collect():
19 rules[row['name']] = row['constraint']
NameError: name 'spark' is not defined
Regards
Rajaniesh