<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Problem with sparkContext.parallelize and volatile functions? in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/problem-with-sparkcontext-parallelize-and-volatile-functions/m-p/3930#M806</link>
    <description>&lt;P&gt;I have the following code:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from time import sleep
from random import random
from operator import add
&amp;nbsp;
def f(a: int) -&amp;gt; float:
    sleep(0.1)
    return random()
  
rdd1 = sc.parallelize(range(20), 2)
rdd2 = sc.parallelize(range(20), 2)
rdd3 = sc.parallelize(range(20), 2)
print('result a1:', rdd1.map(f).reduce(add))
print('result a2:', rdd2.map(f).reduce(add))
print('result a3:', rdd3.map(f).reduce(add))
print('result b3:', sum([f(a) for a in range(20)]))
print('result b3:', sum([f(a) for a in range(20)]))
print('result b3:', sum([f(a) for a in range(20)]))&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Sample output:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;result a1: 9.80073680418538
result a2: 9.80073680418538
result a3: 9.80073680418538
result b3: 9.219767385799257
result b3: 8.175800896981904
result b3: 9.417623482504323&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Can anybody explain to me why the a* results all have the same value? In my opinion, all of the result lines should differ from each other.&lt;/P&gt;&lt;P&gt;How can I correct the code to make sure the a* results are different?&lt;/P&gt;&lt;P&gt;Tested using Runtime 10 and 12.&lt;/P&gt;</description>
    <pubDate>Mon, 29 May 2023 10:14:10 GMT</pubDate>
    <dc:creator>del1000</dc:creator>
    <dc:date>2023-05-29T10:14:10Z</dc:date>
    <item>
      <title>Problem with sparkContext.parallelize and volatile functions?</title>
      <link>https://community.databricks.com/t5/data-engineering/problem-with-sparkcontext-parallelize-and-volatile-functions/m-p/3930#M806</link>
      <description>&lt;P&gt;I have the following code:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from time import sleep
from random import random
from operator import add
&amp;nbsp;
def f(a: int) -&amp;gt; float:
    sleep(0.1)
    return random()
  
rdd1 = sc.parallelize(range(20), 2)
rdd2 = sc.parallelize(range(20), 2)
rdd3 = sc.parallelize(range(20), 2)
print('result a1:', rdd1.map(f).reduce(add))
print('result a2:', rdd2.map(f).reduce(add))
print('result a3:', rdd3.map(f).reduce(add))
print('result b3:', sum([f(a) for a in range(20)]))
print('result b3:', sum([f(a) for a in range(20)]))
print('result b3:', sum([f(a) for a in range(20)]))&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Sample output:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;result a1: 9.80073680418538
result a2: 9.80073680418538
result a3: 9.80073680418538
result b3: 9.219767385799257
result b3: 8.175800896981904
result b3: 9.417623482504323&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Can anybody explain to me why the a* results all have the same value? In my opinion, all of the result lines should differ from each other.&lt;/P&gt;&lt;P&gt;How can I correct the code to make sure the a* results are different?&lt;/P&gt;&lt;P&gt;Tested using Runtime 10 and 12.&lt;/P&gt;</description>
      <pubDate>Mon, 29 May 2023 10:14:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/problem-with-sparkcontext-parallelize-and-volatile-functions/m-p/3930#M806</guid>
      <dc:creator>del1000</dc:creator>
      <dc:date>2023-05-29T10:14:10Z</dc:date>
    </item>
  </channel>
</rss>

