<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: DBUtils from databricks-connect and runtime are quite different libraries.... in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/dbutils-from-databricks-connect-and-runtime-are-quite-different/m-p/90685#M37974</link>
    <description>&lt;P&gt;To make things more confusing, the Databricks SDK definition of `FileInfo` changes again:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;@dataclass
class FileInfo:
    file_size: Optional[int] = None
    """The length of the file in bytes. This field is omitted for directories."""

    is_dir: Optional[bool] = None
    """True if the path is a directory."""

    modification_time: Optional[int] = None
    """Last modification time of given file in milliseconds since epoch."""

    path: Optional[str] = None
    """The absolute path of the file or directory."""

    def as_dict(self) -&amp;gt; dict:
        """Serializes the FileInfo into a dictionary suitable for use as a JSON request body."""
        body = {}
        if self.file_size is not None: body['file_size'] = self.file_size
        if self.is_dir is not None: body['is_dir'] = self.is_dir
        if self.modification_time is not None: body['modification_time'] = self.modification_time
        if self.path is not None: body['path'] = self.path
        return body

    @classmethod
    def from_dict(cls, d: Dict[str, any]) -&amp;gt; FileInfo:
        """Deserializes the FileInfo from a dictionary."""
        return cls(file_size=d.get('file_size', None),
                   is_dir=d.get('is_dir', None),
                   modification_time=d.get('modification_time', None),
                   path=d.get('path', None))&lt;/LI-CODE&gt;</description>
    <pubDate>Tue, 17 Sep 2024 07:35:26 GMT</pubDate>
    <dc:creator>stevenayers-bge</dc:creator>
    <dc:date>2024-09-17T07:35:26Z</dc:date>
    <item>
      <title>DBUtils from databricks-connect and runtime are quite different libraries....</title>
      <link>https://community.databricks.com/t5/data-engineering/dbutils-from-databricks-connect-and-runtime-are-quite-different/m-p/90682#M37973</link>
      <description>&lt;P&gt;If you find yourself using dbutils in any of your code, and you're testing locally vs running on a cluster, there are a few gotchas to be very careful of when it comes to listing files in Volumes or files on DBFS.&lt;/P&gt;&lt;P&gt;The DBUtils you'll use locally installed by databricks-connect:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from databricks.connect import DatabricksSession
from pyspark.dbutils import DBUtils

spark = DatabricksSession.builder.profile('dev').getOrCreate()
dbutils = DBUtils(spark)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;...compared to the one already instantiated in your notebooks, these are entirely different libraries with different interfaces, which led me down a very frustrating rabbit hole yesterday afternoon.&lt;/P&gt;&lt;P&gt;If you run `my_files = dbutils.fs.ls(&amp;lt;your path here&amp;gt;)` and you want to find out whether each of the `FileInfo` objects you got back is a directory or a file, the behaviour differs.&lt;/P&gt;&lt;P&gt;Locally:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;my_files = dbutils.fs.ls(/some/path/to/files)
first_file = my_files[0]

first_file.isDir()
# ERRORS - function does not exist

first_file.size
# This will be None if a directory&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;In a Databricks notebook:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;my_files = dbutils.fs.ls(/some/path/to/files)
first_file = my_files[0]

first_file.isDir()
# Returns boolean true/false

first_file.size
# This will be zero as an integer if a directory &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;If you do need to reliably check if a `FileInfo` object is a directory across both environments, you can emulate the `.isDir()` function by using `first_file.name.endswith('/')`. Below is the FileInfo definition from runtime:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# ************* DBC Public API ***************


# This class definition should be kept in sync with FileInfo definition in runtime dbutils.py
# See https://livegrep.dev.databricks.com/view/databricks/runtime/python/pyspark/dbutils.py#L60
class FileInfo(namedtuple('FileInfo', ['path', 'name', 'size', "modificationTime"])):
    def isDir(self):
        return self.name.endswith('/')

    def isFile(self):
        return not self.isDir()

    @staticmethod
    def create_from_jschema(j_file_info):
        return FileInfo(
            path=j_file_info.path(),
            name=j_file_info.name(),
            size=j_file_info.size(),
            modificationTime=j_file_info.modificationTime())&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 17 Sep 2024 07:27:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dbutils-from-databricks-connect-and-runtime-are-quite-different/m-p/90682#M37973</guid>
      <dc:creator>stevenayers-bge</dc:creator>
      <dc:date>2024-09-17T07:27:01Z</dc:date>
    </item>
    <item>
      <title>Re: DBUtils from databricks-connect and runtime are quite different libraries....</title>
      <link>https://community.databricks.com/t5/data-engineering/dbutils-from-databricks-connect-and-runtime-are-quite-different/m-p/90685#M37974</link>
      <description>&lt;P&gt;To make things more confusing, the Databricks SDK definition of `FileInfo` changes again:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;@dataclass
class FileInfo:
    file_size: Optional[int] = None
    """The length of the file in bytes. This field is omitted for directories."""

    is_dir: Optional[bool] = None
    """True if the path is a directory."""

    modification_time: Optional[int] = None
    """Last modification time of given file in milliseconds since epoch."""

    path: Optional[str] = None
    """The absolute path of the file or directory."""

    def as_dict(self) -&amp;gt; dict:
        """Serializes the FileInfo into a dictionary suitable for use as a JSON request body."""
        body = {}
        if self.file_size is not None: body['file_size'] = self.file_size
        if self.is_dir is not None: body['is_dir'] = self.is_dir
        if self.modification_time is not None: body['modification_time'] = self.modification_time
        if self.path is not None: body['path'] = self.path
        return body

    @classmethod
    def from_dict(cls, d: Dict[str, any]) -&amp;gt; FileInfo:
        """Deserializes the FileInfo from a dictionary."""
        return cls(file_size=d.get('file_size', None),
                   is_dir=d.get('is_dir', None),
                   modification_time=d.get('modification_time', None),
                   path=d.get('path', None))&lt;/LI-CODE&gt;</description>
      <pubDate>Tue, 17 Sep 2024 07:35:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dbutils-from-databricks-connect-and-runtime-are-quite-different/m-p/90685#M37974</guid>
      <dc:creator>stevenayers-bge</dc:creator>
      <dc:date>2024-09-17T07:35:26Z</dc:date>
    </item>
    <item>
      <title>Re: DBUtils from databricks-connect and runtime are quite different libraries....</title>
      <link>https://community.databricks.com/t5/data-engineering/dbutils-from-databricks-connect-and-runtime-are-quite-different/m-p/90696#M37978</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/103678"&gt;@stevenayers-bge&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Thanks for sharing. I didn't know that these interfaces aren't aligned with each other.&lt;/P&gt;</description>
      <pubDate>Tue, 17 Sep 2024 08:58:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dbutils-from-databricks-connect-and-runtime-are-quite-different/m-p/90696#M37978</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2024-09-17T08:58:13Z</dc:date>
    </item>
  </channel>
</rss>

