<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Databrick select from web address that returns JSON in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111231#M43826</link>
    <description>&lt;P&gt;yes, you could use SQL to read the data from the volume:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%sql
SELECT * FROM json.`/Volumes/demo/raw/files/ergast/my_results.json`&lt;/LI-CODE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="StefanKoch_0-1740549044779.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15093iC40F1497F872F4CF/image-size/medium?v=v2&amp;amp;px=400" role="button" title="StefanKoch_0-1740549044779.png" alt="StefanKoch_0-1740549044779.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;You still have to flatten the result.&lt;/P&gt;&lt;P&gt;The thing with the shell was just an example. Wouldn't recommend that, just use python&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 26 Feb 2025 05:52:29 GMT</pubDate>
    <dc:creator>Stefan-Koch</dc:creator>
    <dc:date>2025-02-26T05:52:29Z</dc:date>
    <item>
      <title>Databrick select from web address that returns JSON</title>
      <link>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111187#M43822</link>
      <description>&lt;DIV&gt;&lt;SPAN&gt;Hi I'm in a data bricks notebook and want to select from a web site that returns json.&amp;nbsp;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;For example this web site &lt;SPAN class=""&gt;&lt;A href="http://ergast.com/api/f1/2004/1/results.json" target="_blank" rel="noopener"&gt;http://ergast.com/api/f1/2004/1/results.json&lt;/A&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;will return some JSON. (example only)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Can i do the following or Do you need to use python. (please give an example of py code if so)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Will DB notebooks allow you to do this with out opening anything up.&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;BR /&gt;&lt;PRE&gt;&lt;SPAN class=""&gt;&lt;SPAN&gt;%sql&lt;BR /&gt;select&lt;/SPAN&gt; &lt;SPAN&gt;*&lt;/SPAN&gt; &lt;SPAN&gt;from&lt;/SPAN&gt; &lt;SPAN&gt;json.&lt;/SPAN&gt;`http://ergast.com/api/f1/2004/1/results.json`&lt;/SPAN&gt;&lt;/PRE&gt;&lt;SPAN&gt;`&lt;/SPAN&gt;&lt;/DIV&gt;</description>
      <pubDate>Wed, 26 Feb 2025 02:49:57 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111187#M43822</guid>
      <dc:creator>ozmike</dc:creator>
      <dc:date>2025-02-26T02:49:57Z</dc:date>
    </item>
    <item>
      <title>Re: Databrick select from web address that returns JSON</title>
      <link>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111225#M43823</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/150843"&gt;@ozmike&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;You could do this as example with wget on shell:&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%sh
wget -P /Volumes/demo/raw/files/ergast http://ergast.com/api/f1/2004/1/results.json&lt;/LI-CODE&gt;&lt;P&gt;Then you can access the json file in a volume.&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="StefanKoch_0-1740547550570.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15087i05C341D676662A6F/image-size/medium?v=v2&amp;amp;px=400" role="button" title="StefanKoch_0-1740547550570.png" alt="StefanKoch_0-1740547550570.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;The better way is to use python:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%pip install wget

import wget

url = "http://ergast.com/api/f1/2004/1/results.json"
destination_path = "/Volumes/demo/raw/files/my_results.json"
wget.download(url, out=destination_path)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Then you see the content again in the volume:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="StefanKoch_1-1740547720860.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15088iEBA88EBED07189B4/image-size/medium?v=v2&amp;amp;px=400" role="button" title="StefanKoch_1-1740547720860.png" alt="StefanKoch_1-1740547720860.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;Read the json content into a dataframe:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;df = spark.read.json(destination_path)
display(df)&lt;/LI-CODE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="StefanKoch_2-1740547802740.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15089i7D7771538A046455/image-size/medium?v=v2&amp;amp;px=400" role="button" title="StefanKoch_2-1740547802740.png" alt="StefanKoch_2-1740547802740.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;You may want to flatten the strucure:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql.functions import col, explode

df_flat = df.select(
    col("MRData.RaceTable.Races.season").alias("season"),
    col("MRData.RaceTable.Races.round").alias("round"),
    col("MRData.RaceTable.Races.url").alias("race_url"),
    col("MRData.RaceTable.Races.raceName").alias("race_name"),
    col("MRData.RaceTable.Races.Circuit.circuitId").alias("circuit_id"),
    col("MRData.RaceTable.Races.Circuit.url").alias("circuit_url"),
    col("MRData.RaceTable.Races.Circuit.circuitName").alias("circuit_name"),
    col("MRData.RaceTable.Races.Circuit.Location.lat").alias("circuit_lat"),
    col("MRData.RaceTable.Races.Circuit.Location.long").alias("circuit_long"),
    col("MRData.RaceTable.Races.Circuit.Location.locality").alias("circuit_locality"),
    col("MRData.RaceTable.Races.Circuit.Location.country").alias("circuit_country"),
    explode("MRData.RaceTable.Races.Results").alias("result")
).select(
    col("season"),
    col("round"),
    col("race_url"),
    col("race_name"),
    col("circuit_id"),
    col("circuit_url"),
    col("circuit_name"),
    col("circuit_lat"),
    col("circuit_long"),
    col("circuit_locality"),
    col("circuit_country"),
    col("result.number").alias("driver_number"),
    col("result.position").alias("driver_position"),
    col("result.positionText").alias("driver_position_text"),
    col("result.points").alias("driver_points"),
    col("result.Driver.driverId").alias("driver_id"),
    col("result.Driver.permanentNumber").alias("driver_permanent_number"),
    col("result.Driver.code").alias("driver_code"),
    col("result.Driver.url").alias("driver_url"),
    col("result.Driver.givenName").alias("driver_given_name"),
    col("result.Driver.familyName").alias("driver_family_name"),
    col("result.Driver.dateOfBirth").alias("driver_dob"),
    col("result.Driver.nationality").alias("driver_nationality"),
    col("result.Constructor.constructorId").alias("constructor_id"),
    col("result.Constructor.url").alias("constructor_url"),
    col("result.Constructor.name").alias("constructor_name"),
    col("result.Constructor.nationality").alias("constructor_nationality"),
    col("result.grid").alias("grid_position"),
    col("result.laps").alias("laps"),
    col("result.status").alias("status")
)

display(df_flat)&lt;/LI-CODE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="StefanKoch_3-1740547911744.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15090i89A8AB594A398E4D/image-size/medium?v=v2&amp;amp;px=400" role="button" title="StefanKoch_3-1740547911744.png" alt="StefanKoch_3-1740547911744.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;May, you adapt the script and dynamically flatten the structure.&lt;/P&gt;</description>
      <pubDate>Wed, 26 Feb 2025 05:33:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111225#M43823</guid>
      <dc:creator>Stefan-Koch</dc:creator>
      <dc:date>2025-02-26T05:33:01Z</dc:date>
    </item>
    <item>
      <title>Re: Databrick select from web address that returns JSON</title>
      <link>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111226#M43824</link>
      <description>&lt;P&gt;Hi Thanks&lt;BR /&gt;Can you do a select from this volume you have created ( from SQL not PY )?&lt;/P&gt;&lt;P&gt;SELECT * FROM JSON.'volume:xyz'&lt;/P&gt;&lt;P&gt;Also how do you get&amp;nbsp; shell&amp;nbsp; access from the GUI ? or - is this an admin thing.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 26 Feb 2025 05:42:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111226#M43824</guid>
      <dc:creator>ozmike</dc:creator>
      <dc:date>2025-02-26T05:42:09Z</dc:date>
    </item>
    <item>
      <title>Re: Databrick select from web address that returns JSON</title>
      <link>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111231#M43826</link>
      <description>&lt;P&gt;yes, you could use SQL to read the data from the volume:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%sql
SELECT * FROM json.`/Volumes/demo/raw/files/ergast/my_results.json`&lt;/LI-CODE&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="StefanKoch_0-1740549044779.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/15093iC40F1497F872F4CF/image-size/medium?v=v2&amp;amp;px=400" role="button" title="StefanKoch_0-1740549044779.png" alt="StefanKoch_0-1740549044779.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;You still have to flatten the result.&lt;/P&gt;&lt;P&gt;The thing with the shell was just an example. Wouldn't recommend that, just use python&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 26 Feb 2025 05:52:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databrick-select-from-web-address-that-returns-json/m-p/111231#M43826</guid>
      <dc:creator>Stefan-Koch</dc:creator>
      <dc:date>2025-02-26T05:52:29Z</dc:date>
    </item>
  </channel>
</rss>

