<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Cannot use Databricks ARC as demo code in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/cannot-use-databricks-arc-as-demo-code/m-p/77060#M3404</link>
    <description>&lt;P&gt;I read the link about Databricks ARC -&amp;nbsp;&lt;A href="https://github.com/databricks-industry-solutions/auto-data-linkage" target="_blank"&gt;https://github.com/databricks-industry-solutions/auto-data-linkage&lt;/A&gt;&lt;/P&gt;&lt;P&gt;and run on DBR 12.2 LTS ML runtime environment on DB cloud community&lt;/P&gt;&lt;P&gt;But I got the error below:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;2024/07/08 04:25:33 INFO mlflow.tracking.fluent: Experiment with name '/Users/ha@infinitelambda.com/Databricks Autolinker 2024-07-08 04:25:33.405046' does not exist. Creating a new experiment.
IndexError: list index out of range
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
&amp;lt;command-3969223619542297&amp;gt; in &amp;lt;module&amp;gt;
     17 # )
     18 
---&amp;gt; 19 autolinker.auto_link(
     20   data=data_df,
     21   attribute_columns=attribute_columns,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-a67c339d-1de3-4b7b-8a4e-766134e34824/lib/python3.9/site-packages/arc/autolinker/autolinker.py in auto_link(self, data, attribute_columns, unique_id, comparison_size_limit, max_evals, cleaning, threshold, true_label, random_seed, metric, sample_for_blocking_rules)
    803     self.spark.conf.set("spark.databricks.optimizer.adaptive.enabled", 'False')
    804     if self.linker_mode == "dedupe_only":
--&amp;gt; 805       space = self._create_hyperopt_space(self._autolink_data, self.attribute_columns, comparison_size_limit, sample_for_blocking_rules)
    806     else:
    807       # use the larger dataframe as baseline

/local_disk0/.ephemeral_nfs/envs/pythonEnv-a67c339d-1de3-4b7b-8a4e-766134e34824/lib/python3.9/site-packages/arc/autolinker/autolinker.py in _create_hyperopt_space(self, data, attribute_columns, comparison_size_limit, sample_for_blocking_rules, max_columns_per_and_rule, max_rules_per_or_rule)
    329 
    330     # Generate candidate blocking rules
--&amp;gt; 331     self.blocking_rules = self._generate_candidate_blocking_rules(
    332       data=data,
    333       attribute_columns=attribute_columns,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-a67c339d-1de3-4b7b-8a4e-766134e34824/lib/python3.9/site-packages/arc/autolinker/autolinker.py in _generate_candidate_blocking_rules(self, data, attribute_columns, comparison_size_limit, sample_for_blocking_rules, max_columns_per_and_rule, max_rules_per_or_rule)
    296 
    297     # set deterministic rules to be 500th largest (or largest) blocking rule
--&amp;gt; 298     self.deterministic_columns = df_rules.orderBy(F.col("rule_squared_count")).limit(500).orderBy(F.col("rule_squared_count").desc()).limit(1).collect()[0]["splink_rule"]
    299 
    300     df_rules.unpersist()

IndexError: list index out of range&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;Thanks in advance&lt;/P&gt;</description>
    <pubDate>Mon, 08 Jul 2024 04:37:06 GMT</pubDate>
    <dc:creator>hadoan</dc:creator>
    <dc:date>2024-07-08T04:37:06Z</dc:date>
    <item>
      <title>Cannot use Databricks ARC as demo code</title>
      <link>https://community.databricks.com/t5/machine-learning/cannot-use-databricks-arc-as-demo-code/m-p/77060#M3404</link>
      <description>&lt;P&gt;I read the link about Databricks ARC -&amp;nbsp;&lt;A href="https://github.com/databricks-industry-solutions/auto-data-linkage" target="_blank"&gt;https://github.com/databricks-industry-solutions/auto-data-linkage&lt;/A&gt;&lt;/P&gt;&lt;P&gt;and run on DBR 12.2 LTS ML runtime environment on DB cloud community&lt;/P&gt;&lt;P&gt;But I got the error below:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;2024/07/08 04:25:33 INFO mlflow.tracking.fluent: Experiment with name '/Users/ha@infinitelambda.com/Databricks Autolinker 2024-07-08 04:25:33.405046' does not exist. Creating a new experiment.
IndexError: list index out of range
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
&amp;lt;command-3969223619542297&amp;gt; in &amp;lt;module&amp;gt;
     17 # )
     18 
---&amp;gt; 19 autolinker.auto_link(
     20   data=data_df,
     21   attribute_columns=attribute_columns,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-a67c339d-1de3-4b7b-8a4e-766134e34824/lib/python3.9/site-packages/arc/autolinker/autolinker.py in auto_link(self, data, attribute_columns, unique_id, comparison_size_limit, max_evals, cleaning, threshold, true_label, random_seed, metric, sample_for_blocking_rules)
    803     self.spark.conf.set("spark.databricks.optimizer.adaptive.enabled", 'False')
    804     if self.linker_mode == "dedupe_only":
--&amp;gt; 805       space = self._create_hyperopt_space(self._autolink_data, self.attribute_columns, comparison_size_limit, sample_for_blocking_rules)
    806     else:
    807       # use the larger dataframe as baseline

/local_disk0/.ephemeral_nfs/envs/pythonEnv-a67c339d-1de3-4b7b-8a4e-766134e34824/lib/python3.9/site-packages/arc/autolinker/autolinker.py in _create_hyperopt_space(self, data, attribute_columns, comparison_size_limit, sample_for_blocking_rules, max_columns_per_and_rule, max_rules_per_or_rule)
    329 
    330     # Generate candidate blocking rules
--&amp;gt; 331     self.blocking_rules = self._generate_candidate_blocking_rules(
    332       data=data,
    333       attribute_columns=attribute_columns,

/local_disk0/.ephemeral_nfs/envs/pythonEnv-a67c339d-1de3-4b7b-8a4e-766134e34824/lib/python3.9/site-packages/arc/autolinker/autolinker.py in _generate_candidate_blocking_rules(self, data, attribute_columns, comparison_size_limit, sample_for_blocking_rules, max_columns_per_and_rule, max_rules_per_or_rule)
    296 
    297     # set deterministic rules to be 500th largest (or largest) blocking rule
--&amp;gt; 298     self.deterministic_columns = df_rules.orderBy(F.col("rule_squared_count")).limit(500).orderBy(F.col("rule_squared_count").desc()).limit(1).collect()[0]["splink_rule"]
    299 
    300     df_rules.unpersist()

IndexError: list index out of range&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;Thanks in advance&lt;/P&gt;</description>
      <pubDate>Mon, 08 Jul 2024 04:37:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cannot-use-databricks-arc-as-demo-code/m-p/77060#M3404</guid>
      <dc:creator>hadoan</dc:creator>
      <dc:date>2024-07-08T04:37:06Z</dc:date>
    </item>
  </channel>
</rss>

