<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: QuantileDiscretizer not respecting NumBuckets in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16030#M10255</link>
    <description>&lt;P&gt;Thank you.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;What I did was:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;Applied QuantileDiscretizer to the non-zero values and specified a very small value (bottom 1%) to capture the lower bucket, including zeroes.&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;That fixed the issue! You can define your own splits, which would work as well, but the splits themselves were important in this case.&lt;/P&gt;</description>
    <pubDate>Tue, 14 Sep 2021 01:19:17 GMT</pubDate>
    <dc:creator>Sam</dc:creator>
    <dc:date>2021-09-14T01:19:17Z</dc:date>
    <item>
      <title>QuantileDiscretizer not respecting NumBuckets</title>
      <link>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16027#M10252</link>
      <description>&lt;P&gt;I have set numBuckets and &lt;B&gt;numBucketsArray&lt;/B&gt; for a group of columns to bin them into 5 buckets.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Unfortunately the number of buckets does not seem to be respected across all columns even though there is variation within them.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I have tried setting the relativeError to 0.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.QuantileDiscretizer.html" target="_blank"&gt;https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.QuantileDiscretizer.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Any idea why this is and how to solve it to force the number of buckets specified?&lt;/P&gt;</description>
      <pubDate>Thu, 02 Sep 2021 22:39:48 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16027#M10252</guid>
      <dc:creator>Sam</dc:creator>
      <dc:date>2021-09-02T22:39:48Z</dc:date>
    </item>
    <item>
      <title>Re: QuantileDiscretizer not respecting NumBuckets</title>
      <link>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16029#M10254</link>
      <description>&lt;P&gt;QuantileDiscretizer does not guarantee the number of buckets afaik.  Depending on your data you might get fewer buckets than asked for.&lt;/P&gt;&lt;P&gt;Bucketizer however does, but you have to define your splits.&lt;/P&gt;</description>
      <pubDate>Fri, 03 Sep 2021 13:11:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16029#M10254</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2021-09-03T13:11:52Z</dc:date>
    </item>
    <item>
      <title>Re: QuantileDiscretizer not respecting NumBuckets</title>
      <link>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16030#M10255</link>
      <description>&lt;P&gt;Thank you.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;What I did was:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;Applied QuantileDiscretizer to the non-zero values and specified a very small value (bottom 1%) to capture the lower bucket, including zeroes.&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;That fixed the issue! You can define your own splits, which would work as well, but the splits themselves were important in this case.&lt;/P&gt;</description>
      <pubDate>Tue, 14 Sep 2021 01:19:17 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16030#M10255</guid>
      <dc:creator>Sam</dc:creator>
      <dc:date>2021-09-14T01:19:17Z</dc:date>
    </item>
    <item>
      <title>Re: QuantileDiscretizer not respecting NumBuckets</title>
      <link>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16031#M10256</link>
      <description>&lt;P&gt;Can you explain a bit more?&lt;/P&gt;</description>
      <pubDate>Thu, 14 Jul 2022 03:13:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/quantilediscretizer-not-respecting-numbuckets/m-p/16031#M10256</guid>
      <dc:creator>Hemant</dc:creator>
      <dc:date>2022-07-14T03:13:08Z</dc:date>
    </item>
  </channel>
</rss>

