<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Unable to reproduce Kmeans Clustering results even after setting seed and tolerance in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12766#M7528</link>
    <description>&lt;P&gt;Hi &lt;/P&gt;&lt;P&gt;I have been trying to reproduce Kmeans results with no luck&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Here is my code snippet:&lt;/P&gt;&lt;P&gt;from pyspark.ml.clustering import KMeans&lt;/P&gt;&lt;P&gt;KMeans(featuresCol=featuresCol, k=clusters, maxIter=40, seed=1, tol = .00001)&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Can anyone help?&lt;/P&gt;</description>
    <pubDate>Tue, 10 Jan 2023 23:25:20 GMT</pubDate>
    <dc:creator>mala</dc:creator>
    <dc:date>2023-01-10T23:25:20Z</dc:date>
    <item>
      <title>Unable to reproduce Kmeans Clustering results even after setting seed and tolerance</title>
      <link>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12766#M7528</link>
      <description>&lt;P&gt;Hi &lt;/P&gt;&lt;P&gt;I have been trying to reproduce Kmeans results with no luck&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Here is my code snippet:&lt;/P&gt;&lt;P&gt;from pyspark.ml.clustering import KMeans&lt;/P&gt;&lt;P&gt;KMeans(featuresCol=featuresCol, k=clusters, maxIter=40, seed=1, tol = .00001)&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Can anyone help?&lt;/P&gt;</description>
      <pubDate>Tue, 10 Jan 2023 23:25:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12766#M7528</guid>
      <dc:creator>mala</dc:creator>
      <dc:date>2023-01-10T23:25:20Z</dc:date>
    </item>
    <item>
      <title>Re: Unable to reproduce Kmeans Clustering results even after setting seed and tolerance</title>
      <link>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12768#M7530</link>
      <description>&lt;P&gt;Hi Debayan&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks for your reply. It runs without any issues. Each time I rerun the model, I get different cluster outputs, even after setting the seed and tolerance as shown in my code snippet. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I would expect the results to be the same once the seed is set, since it removes any randomness. I also increased the number of iterations, which didn't help either. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Is there a way to reproduce the results in Spark?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Mala&lt;/P&gt;</description>
      <pubDate>Wed, 11 Jan 2023 21:20:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12768#M7530</guid>
      <dc:creator>mala</dc:creator>
      <dc:date>2023-01-11T21:20:20Z</dc:date>
    </item>
    <item>
      <title>Re: Unable to reproduce Kmeans Clustering results even after setting seed and tolerance</title>
      <link>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12769#M7531</link>
      <description>&lt;P&gt;This issue was due to Spark parallelization, which doesn't guarantee that the same data is assigned to each partition. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I was able to resolve this by making sure the same data is assigned to the same partitions:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;df = df.repartition(num_partitions, "ur_col_id")&lt;/P&gt;&lt;P&gt;df = df.sortWithinPartitions("ur_col_id")&lt;/P&gt;</description>
      <pubDate>Thu, 19 Jan 2023 18:52:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12769#M7531</guid>
      <dc:creator>mala</dc:creator>
      <dc:date>2023-01-19T18:52:56Z</dc:date>
    </item>
    <item>
      <title>Re: Unable to reproduce Kmeans Clustering results even after setting seed and tolerance</title>
      <link>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12767#M7529</link>
      <description>&lt;P&gt;Hi, do you receive any errors? Please refer to &lt;A href="https://www.databricks.com/tensorflow/clustering-and-k-means" target="test_blank"&gt;https://www.databricks.com/tensorflow/clustering-and-k-means&lt;/A&gt; for examples. Please let us know if this helps. &lt;/P&gt;</description>
      <pubDate>Wed, 11 Jan 2023 21:11:45 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/unable-to-reproduce-kmeans-clustering-results-even-after-setting/m-p/12767#M7529</guid>
      <dc:creator>Debayan</dc:creator>
      <dc:date>2023-01-11T21:11:45Z</dc:date>
    </item>
  </channel>
</rss>

