<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: CUDA out of memory in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38251#M1991</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/49718"&gt;@gary7135&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;If you are following the same steps as the GitHub example,&lt;/SPAN&gt; you may need to set the configuration&amp;nbsp;&lt;SPAN&gt;fp16=True in your file.&lt;/SPAN&gt;&lt;/P&gt;</description>
    <pubDate>Mon, 24 Jul 2023 07:44:21 GMT</pubDate>
    <dc:creator>Kumaran</dc:creator>
    <dc:date>2023-07-24T07:44:21Z</dc:date>
    <item>
      <title>CUDA out of memory</title>
      <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38052#M1974</link>
      <description>&lt;P&gt;I am trying out the new Meta LLama2 model.&lt;/P&gt;&lt;P&gt;Following the databricks provided notebook example:&amp;nbsp;&lt;A href="https://github.com/databricks/databricks-ml-examples/blob/master/llm-models/llamav2/llamav2-13b/01_load_inference.py" target="_blank" rel="noopener"&gt;https://github.com/databricks/databricks-ml-examples/blob/master/llm-models/llamav2/llamav2-13b/01_load_inference.py&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I keep getting CUDA out of memory. My GPU cluster runtime is&amp;nbsp;&lt;/P&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;13.2 ML (includes Apache Spark 3.4.0, GPU, Scala 2.12), with 256GB memory and 1 GPU&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Error message:&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;CUDA out of memory. Tried to allocate 314.00 MiB (GPU 0; 14.76 GiB total capacity; 13.50 GiB already allocated; 313.75 MiB free; 13.51 GiB reserved in total by PyTorch) If reserved memory is &amp;gt;&amp;gt; allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;What would be a good way to solve this issue?&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jul 2023 16:28:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38052#M1974</guid>
      <dc:creator>gary7135</dc:creator>
      <dc:date>2023-07-20T16:28:09Z</dc:date>
    </item>
    <item>
      <title>Re: CUDA out of memory</title>
      <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38124#M1977</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/49718"&gt;@gary7135&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;Thank you for posting the question in the Databricks community.&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Kindly update the configuration by setting &lt;/SPAN&gt;fp16=True&lt;SPAN&gt;&amp;nbsp;instead of its current value of false. For further information regarding the CUDA error related to this, please refer to this&amp;nbsp;&lt;A href="https://docs.databricks.com/machine-learning/train-model/huggingface/fine-tune-model.html#:~:text=to%20resolve%20them.-,OutOfMemoryError%3A%20CUDA%20out%20of%20memory,-When%20training%20large" target="_self"&gt;documentation&lt;/A&gt;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 21 Jul 2023 14:41:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38124#M1977</guid>
      <dc:creator>Kumaran</dc:creator>
      <dc:date>2023-07-21T14:41:46Z</dc:date>
    </item>
    <item>
      <title>Re: CUDA out of memory</title>
      <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38175#M1979</link>
      <description>&lt;P&gt;Thank you. Can you provide example of how to set this argument in notebooks?&lt;/P&gt;</description>
      <pubDate>Fri, 21 Jul 2023 17:54:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38175#M1979</guid>
      <dc:creator>gary7135</dc:creator>
      <dc:date>2023-07-21T17:54:20Z</dc:date>
    </item>
    <item>
      <title>Re: CUDA out of memory</title>
      <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38179#M1980</link>
      <description>&lt;P&gt;Hello&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/49718"&gt;@gary7135&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;Thank you for the response.&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;According to GitHub (you shared above), you should have a configuration file where you need to make this settings. Please refer to the image below for more details:&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Kumaran_0-1689964296082.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/2955iC7C84EBE0FDC404C/image-size/medium?v=v2&amp;amp;px=400" role="button" title="Kumaran_0-1689964296082.png" alt="Kumaran_0-1689964296082.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 21 Jul 2023 18:32:57 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38179#M1980</guid>
      <dc:creator>Kumaran</dc:creator>
      <dc:date>2023-07-21T18:32:57Z</dc:date>
    </item>
    <item>
      <title>Re: CUDA out of memory</title>
      <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38189#M1986</link>
      <description>&lt;P&gt;Thank you. I am running this python file directly in Databricks notebook&amp;nbsp;&lt;A href="https://github.com/databricks/databricks-ml-examples/blob/master/llm-models/llamav2/llamav2-7b/01_load_inference.py" target="_blank"&gt;https://github.com/databricks/databricks-ml-examples/blob/master/llm-models/llamav2/llamav2-7b/01_load_inference.py&lt;/A&gt;&lt;/P&gt;&lt;P&gt;The file does not seem to reference the config json file?&lt;/P&gt;</description>
      <pubDate>Fri, 21 Jul 2023 20:25:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38189#M1986</guid>
      <dc:creator>gary7135</dc:creator>
      <dc:date>2023-07-21T20:25:55Z</dc:date>
    </item>
    <item>
      <title>Re: CUDA out of memory</title>
      <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38215#M1989</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/63081"&gt;@Kumaran&lt;/a&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Hope you are well. Just wanted to see if you were able to find an answer to your question and would you like to mark an answer as best? It would be really helpful for the other members too.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Cheers!&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 23 Jul 2023 04:43:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38215#M1989</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-07-23T04:43:07Z</dc:date>
    </item>
    <item>
      <title>Re: CUDA out of memory</title>
      <link>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38251#M1991</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/49718"&gt;@gary7135&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;If you are following the same steps as the GitHub example,&lt;/SPAN&gt; you may need to set the configuration&amp;nbsp;&lt;SPAN&gt;fp16=True in your file.&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 24 Jul 2023 07:44:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/cuda-out-of-memory/m-p/38251#M1991</guid>
      <dc:creator>Kumaran</dc:creator>
      <dc:date>2023-07-24T07:44:21Z</dc:date>
    </item>
  </channel>
</rss>

