<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic R Package Installation Best Practices in Get Started Discussions</title>
    <link>https://community.databricks.com/t5/get-started-discussions/r-package-installation-best-practices/m-p/79968#M7927</link>
    <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;We are new to databricks and are wondering what the best practices are for R package installation. We currently have cluster spin up wait times of more than 20 minutes with our init scripts. We have tried the following:&lt;/P&gt;&lt;P&gt;1. Libraries tab in the cluster preferences&lt;BR /&gt;2. Docker container&lt;BR /&gt;3. Init script shown below&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;Any help would be appreciated. We haven't been able to start development because of these wait times.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Ann&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;#!/bin/bash&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Update package list and install system dependencies&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;apt-get update -qq &amp;amp;&amp;amp; apt-get install -y -qq \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;gdal-bin \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libgdal-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libudunits2-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libproj-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libgeos-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-covr \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-inline \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-pkgkitten \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-tinytest \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-xml2 \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-zoo &lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Install R packages&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;R -e &lt;/SPAN&gt;&lt;SPAN&gt;"install.packages('prism', repos='&lt;A href="https://cloud.r-project.org/" target="_blank" rel="noopener"&gt;https://cloud.r-project.org/&lt;/A&gt;')"&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Install Python 
packages&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;/databricks/python3/bin/pip install cutadapt&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Install GDAL with pip&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;/databricks/python3/bin/pip install GDAL==3.2.2.1&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Print completion message&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;echo &lt;/SPAN&gt;&lt;SPAN&gt;"Initialization script completed successfully."&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
    <pubDate>Mon, 22 Jul 2024 17:55:10 GMT</pubDate>
    <dc:creator>araiho</dc:creator>
    <dc:date>2024-07-22T17:55:10Z</dc:date>
    <item>
      <title>R Package Installation Best Practices</title>
      <link>https://community.databricks.com/t5/get-started-discussions/r-package-installation-best-practices/m-p/79968#M7927</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;We are new to databricks and are wondering what the best practices are for R package installation. We currently have cluster spin up wait times of more than 20 minutes with our init scripts. We have tried the following:&lt;/P&gt;&lt;P&gt;1. Libraries tab in the cluster preferences&lt;BR /&gt;2. Docker container&lt;BR /&gt;3. Init script shown below&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;Any help would be appreciated. We haven't been able to start development because of these wait times.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Ann&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;#!/bin/bash&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Update package list and install system dependencies&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;apt-get update -qq &amp;amp;&amp;amp; apt-get install -y -qq \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;gdal-bin \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libgdal-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libudunits2-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libproj-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;libgeos-dev \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-covr \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-inline \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-pkgkitten \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-tinytest \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-xml2 \&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;r-cran-zoo &lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Install R packages&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;R -e &lt;/SPAN&gt;&lt;SPAN&gt;"install.packages('prism', repos='&lt;A href="https://cloud.r-project.org/" target="_blank" rel="noopener"&gt;https://cloud.r-project.org/&lt;/A&gt;')"&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Install Python 
packages&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;/databricks/python3/bin/pip install cutadapt&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Install GDAL with pip&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;/databricks/python3/bin/pip install GDAL==3.2.2.1&lt;/SPAN&gt;&lt;/DIV&gt;&lt;BR /&gt;&lt;DIV&gt;&lt;SPAN&gt;# Print completion message&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;echo &lt;/SPAN&gt;&lt;SPAN&gt;"Initialization script completed successfully."&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Mon, 22 Jul 2024 17:55:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/r-package-installation-best-practices/m-p/79968#M7927</guid>
      <dc:creator>araiho</dc:creator>
      <dc:date>2024-07-22T17:55:10Z</dc:date>
    </item>
    <item>
      <title>Re: R Package Installation Best Practices</title>
      <link>https://community.databricks.com/t5/get-started-discussions/r-package-installation-best-practices/m-p/80373#M7928</link>
      <description>&lt;P&gt;I wanted to add that with this script I cannot load the prism or sf packages. I think there is something going on with the directories that gdal and proj are installed to.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 24 Jul 2024 13:15:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/r-package-installation-best-practices/m-p/80373#M7928</guid>
      <dc:creator>araiho</dc:creator>
      <dc:date>2024-07-24T13:15:58Z</dc:date>
    </item>
    <item>
      <title>Re: R Package Installation Best Practices</title>
      <link>https://community.databricks.com/t5/get-started-discussions/r-package-installation-best-practices/m-p/80599#M7930</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;&amp;nbsp;Thank you for your detailed response! I think we would like to use Docker if we can because we are not using RStudio but R directly in the Databricks notebooks and workflows. So, any more information about R and Docker and Databricks would also be useful. Currently, this docker code builds successfully and is archived successfully but is not deploying on Databricks.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;# syntax=docker/dockerfile:1.2

# Stage 1: Build R environment with Rocker
FROM --platform=linux/amd64 rocker/r-base:latest AS rbuilder

# Install required R packages in the Rocker image
RUN apt-get update &amp;amp;&amp;amp; apt-get install -y \
    r-cran-dplyr \
    r-cran-ggplot2 \
    r-cran-tidyr \
    &amp;amp;&amp;amp; apt-get clean \
    &amp;amp;&amp;amp; rm -rf /var/lib/apt/lists/*

# Stage 2: Use Databricks image and copy R installation from Rocker
FROM --platform=linux/amd64 databricksruntime/standard:latest

# Copy R binaries and libraries from the Rocker image
COPY --from=rbuilder /usr/lib/R /usr/lib/R
COPY --from=rbuilder /usr/share/R /usr/share/R
COPY --from=rbuilder /etc/R /etc/R
COPY --from=rbuilder /usr/bin/R /usr/bin/R
COPY --from=rbuilder /usr/bin/Rscript /usr/bin/Rscript

# Ensure the R library paths are correctly set
ENV R_HOME=/usr/lib/R
ENV PATH=$PATH:/usr/lib/R/bin

# Copy R packages from the previous stage
COPY --from=rbuilder /usr/lib/R/site-library /usr/local/lib/R/site-library
COPY --from=rbuilder /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;I have solved my dependency problem with the following code in my notebook, but I am a bit confused why it works&amp;nbsp;&lt;SPAN&gt;because the PROJ_LIB has to be set to&amp;nbsp;&lt;/SPAN&gt;/usr/share/proj&lt;SPAN&gt;&amp;nbsp;and then reset in the install of sf and prism to&amp;nbsp;&lt;/SPAN&gt;/lib/x86_64-linux-gnu&lt;SPAN&gt;&amp;nbsp;and then the repo for sf has to be&amp;nbsp;&lt;/SPAN&gt;&lt;A class="" href="https://cran.r-project.org/" target="_blank" rel="noopener noreferrer"&gt;https://cran.r-project.org&lt;/A&gt;&lt;SPAN&gt;&amp;nbsp;but could be&amp;nbsp;&lt;/SPAN&gt;&lt;A class="" href="https://packagemanager.rstudio.com/cran/__linux__/focal/latest" target="_blank" rel="noopener noreferrer"&gt;https://packagemanager.rstudio.com/cran/__linux__/focal/latest&lt;/A&gt;&lt;SPAN&gt;&amp;nbsp;for prism. I would like to use the second repo as much as possible to install R packages because it is much faster than CRAN.&amp;nbsp;&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;%r
system('sudo apt-get -y update &amp;amp;&amp;amp; apt-get install -y  libudunits2-dev libgdal-dev libgeos-dev libproj-dev')

%sh
ldconfig -p | grep gdal
ldconfig -p | grep geos
ldconfig -p | grep proj

%r
options(HTTPUserAgent = sprintf(
  "R/%s R (%s)", 
  getRversion(), 
  paste(
    getRversion(), 
    R.version["platform"], 
    R.version["arch"], 
    R.version["os"]
  )
))

Sys.setenv(PROJ_LIB = "/usr/share/proj")

install.packages('units', lib='/databricks/spark/R/lib/',
  repos="https://cran.r-project.org")
install.packages('sf', 
  configure.args = "--with-proj-lib=/lib/x86_64-linux-gnu --with-proj-include=/usr/include",
  lib='/databricks/spark/R/lib/',
  repos="https://cran.r-project.org"
)

library(sf, lib.loc='/databricks/spark/R/lib/')
install.packages('prism', 
  configure.args = "--with-proj-lib=/lib/x86_64-linux-gnu --with-proj-include=/usr/include",
  lib='/databricks/spark/R/lib/',
  repos = c(CRAN = "https://packagemanager.rstudio.com/cran/__linux__/focal/latest")
)
library(prism, lib.loc='/databricks/spark/R/lib/')&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Anyway! Thank you again for answering.&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 25 Jul 2024 16:01:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/r-package-installation-best-practices/m-p/80599#M7930</guid>
      <dc:creator>araiho</dc:creator>
      <dc:date>2024-07-25T16:01:10Z</dc:date>
    </item>
  </channel>
</rss>

