<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: reading multiple csv files using pathos.multiprocessing in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13586#M8244</link>
    <description>&lt;P&gt;hey @Punit Chauhan​&amp;nbsp;refer to this code&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from multiprocessing.pool import ThreadPool
pool = ThreadPool(5)
notebooks = ['dim_1', 'dim_2']
pool.map(lambda path: dbutils.notebook.run("/Test/Threading/"+path, timeout_seconds= 60, arguments={"input-data": path}),notebooks)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 22 Dec 2022 09:34:10 GMT</pubDate>
    <dc:creator>Rishabh-Pandey</dc:creator>
    <dc:date>2022-12-22T09:34:10Z</dc:date>
    <item>
      <title>reading multiple csv files using pathos.multiprocessing</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13582#M8240</link>
      <description>&lt;P&gt;I'm&amp;nbsp;using&amp;nbsp;PySpark&amp;nbsp;and&amp;nbsp;Pathos&amp;nbsp;to&amp;nbsp;read&amp;nbsp;numerous&amp;nbsp;CSV&amp;nbsp;files&amp;nbsp;and&amp;nbsp;create&amp;nbsp;many&amp;nbsp;DF,&amp;nbsp;but&amp;nbsp;I&amp;nbsp;keep&amp;nbsp;getting&amp;nbsp;this&amp;nbsp;problem.&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="dbx_error"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/1706iEB1E8E8C26619A82/image-size/large?v=v2&amp;amp;px=999" role="button" title="dbx_error" alt="dbx_error" /&gt;&lt;/span&gt;code for the same:-&lt;/P&gt;&lt;P&gt;from pathos.multiprocessing import ProcessingPool&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;def readCsv(path):&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;return spark.read.csv(path,header=True)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;csv_file_list = [file[0][5:] for file in dbutils.fs.ls("/databricks-datasets/COVID/coronavirusdataset/") if file[1].endswith(".csv")]&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;pool = ProcessingPool(2)&lt;/P&gt;&lt;P&gt;results = pool.map(readCsv, csv_file_list)&lt;/P&gt;</description>
      <pubDate>Thu, 14 Jul 2022 06:51:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13582#M8240</guid>
      <dc:creator>Prototype998</dc:creator>
      <dc:date>2022-07-14T06:51:20Z</dc:date>
    </item>
    <item>
      <title>Re: reading multiple csv files using pathos.multiprocessing</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13583#M8241</link>
      <description>&lt;P&gt;You actually don't need to filter `.csv` files like that.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;You can use `pathGlobFilter` to select only the files whose names match the provided glob pattern (note that it is a glob pattern, not a regular expression).&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;df = spark.read.option("pathGlobFilter","*.csv").csv(upload_path)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 14 Jul 2022 09:41:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13583#M8241</guid>
      <dc:creator>AmanSehgal</dc:creator>
      <dc:date>2022-07-14T09:41:06Z</dc:date>
    </item>
    <item>
      <title>Re: reading multiple csv files using pathos.multiprocessing</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13584#M8242</link>
      <description>&lt;P&gt;Hi @Punit Chauhan​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Hope all is well! Just wanted to check in if you were able to resolve your issue and would you be happy to share the solution or mark an answer as best? Else please let us know if you need more help.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;We'd love to hear from you.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 04 Sep 2022 07:00:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13584#M8242</guid>
      <dc:creator>Vidula</dc:creator>
      <dc:date>2022-09-04T07:00:49Z</dc:date>
    </item>
    <item>
      <title>Re: reading multiple csv files using pathos.multiprocessing</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13585#M8243</link>
      <description>&lt;P&gt;@Ajay Pandey​&amp;nbsp;@Rishabh Pandey​&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 22 Dec 2022 09:30:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13585#M8243</guid>
      <dc:creator>Prototype998</dc:creator>
      <dc:date>2022-12-22T09:30:41Z</dc:date>
    </item>
    <item>
      <title>Re: reading multiple csv files using pathos.multiprocessing</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13586#M8244</link>
      <description>&lt;P&gt;hey @Punit Chauhan​&amp;nbsp;refer to this code&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from multiprocessing.pool import ThreadPool
pool = ThreadPool(5)
notebooks = ['dim_1', 'dim_2']
pool.map(lambda path: dbutils.notebook.run("/Test/Threading/"+path, timeout_seconds= 60, arguments={"input-data": path}),notebooks)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 22 Dec 2022 09:34:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13586#M8244</guid>
      <dc:creator>Rishabh-Pandey</dc:creator>
      <dc:date>2022-12-22T09:34:10Z</dc:date>
    </item>
    <item>
      <title>Re: reading multiple csv files using pathos.multiprocessing</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13587#M8245</link>
      <description>&lt;P&gt;thanks @Rishabh Pandey​&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 22 Dec 2022 09:35:14 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-multiple-csv-files-using-pathos-multiprocessing/m-p/13587#M8245</guid>
      <dc:creator>Prototype998</dc:creator>
      <dc:date>2022-12-22T09:35:14Z</dc:date>
    </item>
  </channel>
</rss>

