<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Run threadpool on multiple nodes in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/run-threadpool-on-multiple-nodes/m-p/37029#M26242</link>
    <description>&lt;P&gt;I've run a dual multiprocessing and multithreading solution in Python before using the multiprocessing and concurrent.futures Python modules. However, since the multiprocessing module only runs on the driver node, I have to instead use sc.parallelize to distribute the workload to the worker nodes. I'm having a bit of trouble and would appreciate input on troubleshooting (code below). Here, I'm just trying to split a list of 100 values across 10 worker nodes and implement threading on each of those worker nodes.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from concurrent.futures import ThreadPoolExecutor

def multithread(task, l):
   """Apply `task` to every item of `l` using a thread pool.

   A ThreadPoolExecutor with the default worker count is created for the
   call and closed again when the `with` block exits. Returns a list of
   the values produced by `task`, in the same order as the items of `l`.
   """
   with ThreadPoolExecutor() as pool:
      mapped = pool.map(task, l)
      # Drain the iterator inside the `with` block so every result is
      # collected before the pool is shut down.
      collected = [value for value in mapped]
   return collected

def square(x):
   # Simulated slow task: pause for one second, then return x squared.
   # `time` is imported locally because the snippet never imports it at
   # module level, which would otherwise make this call fail with NameError.
   import time
   time.sleep(1)
   return x**2

def partition(l, n):
   # Generator that splits the input list into consecutive chunks of at most
   # 'n' items each. Note that 'n' is the chunk SIZE, not the chunk count:
   # the number of chunks is ceil(len(l) / n), and the last chunk is shorter
   # when len(l) is not a multiple of 'n'.
   for start in range(0, len(l), n):
      yield l[start:start + n]

num = list(range(100))
workers = 10
chunks = list(partition(num, workers))
rdd = sc.parallelize(chunks, numSlices=workers)
results = rdd.map(lambda x: multithread(x)).collect()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 05 Jul 2023 18:37:09 GMT</pubDate>
    <dc:creator>pjp94</dc:creator>
    <dc:date>2023-07-05T18:37:09Z</dc:date>
    <item>
      <title>Run threadpool on multiple nodes</title>
      <link>https://community.databricks.com/t5/data-engineering/run-threadpool-on-multiple-nodes/m-p/37029#M26242</link>
      <description>&lt;P&gt;I've run a dual multiprocessing and multithreading solution in Python before using the multiprocessing and concurrent.futures Python modules. However, since the multiprocessing module only runs on the driver node, I have to instead use sc.parallelize to distribute the workload to the worker nodes. I'm having a bit of trouble and would appreciate input on troubleshooting (code below). Here, I'm just trying to split a list of 100 values across 10 worker nodes and implement threading on each of those worker nodes.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from concurrent.futures import ThreadPoolExecutor

def multithread(task, l):
   """Apply `task` to every item of `l` using a thread pool.

   A ThreadPoolExecutor with the default worker count is created for the
   call and closed again when the `with` block exits. Returns a list of
   the values produced by `task`, in the same order as the items of `l`.
   """
   with ThreadPoolExecutor() as pool:
      mapped = pool.map(task, l)
      # Drain the iterator inside the `with` block so every result is
      # collected before the pool is shut down.
      collected = [value for value in mapped]
   return collected

def square(x):
   # Simulated slow task: pause for one second, then return x squared.
   # `time` is imported locally because the snippet never imports it at
   # module level, which would otherwise make this call fail with NameError.
   import time
   time.sleep(1)
   return x**2

def partition(l, n):
   # Generator that splits the input list into consecutive chunks of at most
   # 'n' items each. Note that 'n' is the chunk SIZE, not the chunk count:
   # the number of chunks is ceil(len(l) / n), and the last chunk is shorter
   # when len(l) is not a multiple of 'n'.
   for start in range(0, len(l), n):
      yield l[start:start + n]

num = list(range(100))
workers = 10
chunks = list(partition(num, workers))
rdd = sc.parallelize(chunks, numSlices=workers)
results = rdd.map(lambda x: multithread(x)).collect()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 05 Jul 2023 18:37:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/run-threadpool-on-multiple-nodes/m-p/37029#M26242</guid>
      <dc:creator>pjp94</dc:creator>
      <dc:date>2023-07-05T18:37:09Z</dc:date>
    </item>
  </channel>
</rss>

