<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: reading data from url using spark in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28010#M19848</link>
    <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I face the same issue as above with the following error: &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;I&gt;Path does not exist: dbfs:/local_disk0/spark-9f23ed57-133e-41d5-91b2-12555d641961/userFiles-d252b3ba-499c-42c9-be48-96358357fb75/adult.csv&lt;/I&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Unfortunately, this link is dead: &lt;A href="https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html" target="test_blank"&gt;https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Would it be possible to give the solution again?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Fri, 29 Oct 2021 11:00:31 GMT</pubDate>
    <dc:creator>RantoB</dc:creator>
    <dc:date>2021-10-29T11:00:31Z</dc:date>
    <item>
      <title>reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28006#M19844</link>
      <description>&lt;P&gt;&lt;/P&gt;
&lt;P&gt;reading data from url using spark, community edition, got a path related error, any suggestions please?&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;url = "https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/adult.csv"
from pyspark import SparkFiles
spark.sparkContext.addFile(url)
# sc.addFile(url)
# sqlContext = SQLContext(sc)
# df = sqlContext.read.csv(SparkFiles.get("adult.csv"), header=True, inferSchema= True) 
df = spark.read.csv(SparkFiles.get("adult.csv"), header=True, inferSchema= True)&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;&lt;I&gt;&lt;/I&gt;&lt;PRE&gt;&lt;I&gt;&lt;CODE&gt;error:&lt;/CODE&gt;&lt;/I&gt;&lt;/PRE&gt;&lt;/P&gt;
&lt;P&gt;&lt;I&gt;&lt;/I&gt;&lt;PRE&gt;&lt;I&gt;&lt;CODE&gt;Path does not exist: dbfs:/local_disk0/spark-9f23ed57-133e-41d5-91b2-12555d641961/userFiles-d252b3ba-499c-42c9-be48-96358357fb75/adult.csv&lt;/CODE&gt;&lt;/I&gt;&lt;/PRE&gt;&lt;/P&gt; 
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 12 Jul 2019 22:07:30 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28006#M19844</guid>
      <dc:creator>AryaMa</dc:creator>
      <dc:date>2019-07-12T22:07:30Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28007#M19845</link>
      <description>&lt;P&gt;&lt;/P&gt;
&lt;P&gt;Hi &lt;A href="https://users/31089/rr-5454.html" target="_blank"&gt;@rr_5454&lt;/A&gt;,&lt;/P&gt;
&lt;P&gt;You will find the answer here &lt;A target="_blank" href="https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html"&gt;https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;
&lt;P&gt;You will have to:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;
&lt;OL&gt;&lt;LI&gt;get the file to local file storage&lt;/LI&gt;&lt;LI&gt;move the file to dbfs&lt;P&gt;&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;load the file in a dataframe&lt;/LI&gt;&lt;/OL&gt;
&lt;P&gt;This is one of the possible solutions.&lt;/P&gt; 
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 16 Jul 2019 08:21:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28007#M19845</guid>
      <dc:creator>DonatienTessier</dc:creator>
      <dc:date>2019-07-16T08:21:08Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28008#M19846</link>
      <description>&lt;P&gt;&lt;/P&gt;
&lt;P&gt;I face the same issue, could you provide some code for assistance? thanks&lt;/P&gt; 
&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 09 Aug 2019 00:15:45 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28008#M19846</guid>
      <dc:creator>THIAM_HUATTAN</dc:creator>
      <dc:date>2019-08-09T00:15:45Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28009#M19847</link>
      <description>&lt;P&gt;With code for anyone facing the same issue, and without moving to a different path&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;import requests
&amp;nbsp;
CHUNK_SIZE=4096
&amp;nbsp;
with requests.get("https://raw.githubusercontent.com/suy1968/Adult.csv-Dataset/main/adult.csv", stream=True) as resp:
  if resp.ok:
    with open("/dbfs/FileStore/data/adult.csv", "wb") as f:
      for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):
        f.write(chunk)
&amp;nbsp;
display(spark.read.csv("dbfs:/FileStore/data/adult.csv", header=True, inferSchema=True))&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;I had to use a different URL as the one in the original question was no longer available&lt;/P&gt;</description>
      <pubDate>Tue, 28 Sep 2021 19:31:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28009#M19847</guid>
      <dc:creator>dazfuller</dc:creator>
      <dc:date>2021-09-28T19:31:54Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28010#M19848</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I face the same issue as above with the following error: &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;I&gt;Path does not exist: dbfs:/local_disk0/spark-9f23ed57-133e-41d5-91b2-12555d641961/userFiles-d252b3ba-499c-42c9-be48-96358357fb75/adult.csv&lt;/I&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Unfortunately, this link is dead: &lt;A href="https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html" target="test_blank"&gt;https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Would it be possible to give the solution again?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 29 Oct 2021 11:00:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28010#M19848</guid>
      <dc:creator>RantoB</dc:creator>
      <dc:date>2021-10-29T11:00:31Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28011#M19849</link>
      <description>&lt;P&gt;@Bertrand BURCKER​&amp;nbsp;- Try here - &lt;A href="https://web.archive.org/web/20201030194155/https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html" target="test_blank"&gt;https://web.archive.org/web/20201030194155/https://forums.databricks.com/questions/10648/upload-local-files-into-dbfs-1.html&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 29 Oct 2021 20:44:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28011#M19849</guid>
      <dc:creator>Piper_Wilson</dc:creator>
      <dc:date>2021-10-29T20:44:58Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28012#M19850</link>
      <description>&lt;P&gt;I got an answer there :&lt;/P&gt;&lt;P&gt;&lt;A href="https://community.databricks.com/s/question/0D73f000001YmAB/detail?s1oid=00D61000000JGc4&amp;amp;s1nid=0DB3f000000KylM&amp;amp;emkind=chatterCommentNotification&amp;amp;s1uid=0053f000000WY5K&amp;amp;emtm=1635544935994&amp;amp;fromEmail=1&amp;amp;s1ext=0" alt="https://community.databricks.com/s/question/0D73f000001YmAB/detail?s1oid=00D61000000JGc4&amp;amp;s1nid=0DB3f000000KylM&amp;amp;emkind=chatterCommentNotification&amp;amp;s1uid=0053f000000WY5K&amp;amp;emtm=1635544935994&amp;amp;fromEmail=1&amp;amp;s1ext=0" target="_blank"&gt;read csv directly from url with pyspark (databricks.com)&lt;/A&gt;&lt;/P&gt;&lt;P&gt;thanks&lt;/P&gt;</description>
      <pubDate>Tue, 02 Nov 2021 07:55:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28012#M19850</guid>
      <dc:creator>RantoB</dc:creator>
      <dc:date>2021-11-02T07:55:24Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28013#M19851</link>
      <description>&lt;P&gt;@Bertrand BURCKER​&amp;nbsp;- That's great! Would you be happy to mark your answer as best so that others can find it easily?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks!&lt;/P&gt;</description>
      <pubDate>Tue, 02 Nov 2021 15:44:32 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28013#M19851</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2021-11-02T15:44:32Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28014#M19852</link>
      <description>&lt;P&gt;Hi ,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;We can also read CSV directly without writing it to DBFS.&lt;/P&gt;&lt;P&gt;Scala spark Approach&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;import org.apache.commons.io.IOUtils // jar will be already there in spark cluster no need to worry
import java.net.URL 
&amp;nbsp;
val urlfile=new URL("https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv")
  val testDummyCSV = IOUtils.toString(urlfile,"UTF-8").lines.toList.toDS()
  val testcsv = spark
                .read.option("header", true)
                .option("inferSchema", true)
                .csv(testDummyCSV)&lt;/CODE&gt;&lt;/PRE&gt;&lt;PRE&gt;&lt;CODE&gt;display(testcsv)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="image"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/2509i48FEEE281859CA1E/image-size/large?v=v2&amp;amp;px=999" role="button" title="image" alt="image" /&gt;&lt;/span&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Notebook attached &lt;/P&gt;</description>
      <pubDate>Fri, 26 Nov 2021 14:14:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28014#M19852</guid>
      <dc:creator>User16752246494</dc:creator>
      <dc:date>2021-11-26T14:14:20Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28015#M19853</link>
      <description>&lt;P&gt;hello everyone, this issue has not been resolved until today. I appreciate all the palliative ways. But shouldn't SparkFiles be able to extract data from an API? I tested SparkFiles on Community Databricks without errors, but on Azure it generates the path not found message.&lt;/P&gt;</description>
      <pubDate>Mon, 13 Dec 2021 17:48:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28015#M19853</guid>
      <dc:creator>weldermartins</dc:creator>
      <dc:date>2021-12-13T17:48:49Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28016#M19854</link>
      <description>&lt;P&gt;hi,&lt;/P&gt;&lt;P&gt;does the best answer of this post help you :&lt;/P&gt;&lt;P&gt;&lt;A href="https://community.databricks.com/s/question/0D73f000001YmAB/detail?s1oid=00D61000000JGc4&amp;amp;s1nid=0DB3f000000KylM&amp;amp;emkind=chatterCommentNotification&amp;amp;s1uid=0053f000000WY5K&amp;amp;emtm=1635544935994&amp;amp;fromEmail=1&amp;amp;s1ext=0" alt="https://community.databricks.com/s/question/0D73f000001YmAB/detail?s1oid=00D61000000JGc4&amp;amp;s1nid=0DB3f000000KylM&amp;amp;emkind=chatterCommentNotification&amp;amp;s1uid=0053f000000WY5K&amp;amp;emtm=1635544935994&amp;amp;fromEmail=1&amp;amp;s1ext=0" target="_blank"&gt;read csv directly from url with pyspark (databricks.com)&lt;/A&gt; ?&lt;/P&gt;</description>
      <pubDate>Tue, 14 Dec 2021 08:04:48 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28016#M19854</guid>
      <dc:creator>RantoB</dc:creator>
      <dc:date>2021-12-14T08:04:48Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28017#M19855</link>
      <description>&lt;P&gt;Hi, I already know how SparkFiles is supposed to work; it is its behavior within Azure that is not correct.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;The discussion is here: &lt;/P&gt;&lt;P&gt;&lt;A href="https://community.databricks.com/s/question/0D53f00001XD3pjCAD/sparkfiles-strange-behavior-on-azure-databricks-runtime-10" target="test_blank"&gt;https://community.databricks.com/s/question/0D53f00001XD3pjCAD/sparkfiles-strange-behavior-on-azure-databricks-runtime-10&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 14 Dec 2021 11:56:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28017#M19855</guid>
      <dc:creator>weldermartins</dc:creator>
      <dc:date>2021-12-14T11:56:34Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28018#M19856</link>
      <description>&lt;P&gt;Sorry, bringing this back up...&lt;/P&gt;&lt;P&gt;​&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark import SparkFiles
url = "http://raw.githubusercontent.com/ltregan/ds-data/main/authors.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv("file://"+SparkFiles.get("authors.csv"), header=True, inferSchema= True)
df.show()&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;I get this empty output:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;++
||
++
++&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Any idea ? Spark 3.2.2 on Mac M1&lt;/P&gt;</description>
      <pubDate>Wed, 01 Mar 2023 21:10:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/28018#M19856</guid>
      <dc:creator>padang</dc:creator>
      <dc:date>2023-03-01T21:10:07Z</dc:date>
    </item>
    <item>
      <title>Re: reading data from url using spark</title>
      <link>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/39432#M26974</link>
      <description>&lt;P&gt;Hi there,&lt;BR /&gt;I have pretty much the exact code you have here, and yet it still doesn't work, saying "No such file or directory"&lt;BR /&gt;Is this a limitation of the community edition?&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import requests
CHUNK_SIZE=4096
def get_remote_file(dataSrcUrl, destFile):
    '''Simple old skool python function to load a remote url into local hdfs '''
    destFile = "/dbfs" + destFile
    #
    with requests.get(dataSrcUrl, stream=True) as resp:
        if resp.ok:
            with open(destFile, "wb") as f:
                for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):
                    f.write(chunk)&lt;/LI-CODE&gt;&lt;LI-CODE lang="python"&gt;get_remote_file("https://gitlab.com/opstar/share20/-/raw/master/university.json", "/Filestore/data/lgdt/university.json" )&lt;/LI-CODE&gt;&lt;P&gt;The directory "dbfs:/Filestore/data/lgdt" definitely exists as i can see it when running the dbutils.fs.ls(path) command&lt;/P&gt;</description>
      <pubDate>Wed, 09 Aug 2023 10:11:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/reading-data-from-url-using-spark/m-p/39432#M26974</guid>
      <dc:creator>lemfo</dc:creator>
      <dc:date>2023-08-09T10:11:04Z</dc:date>
    </item>
  </channel>
</rss>

