We are seeing intermittent errors where a Job Task cannot access a Catalog through a Volume; it fails with the error: `PermissionError: [Errno 1] Operation not permitted: '/Volumes/mycatalog'`. The Job has 40 tasks running in parallel, and every few runs the error occurs in a different Task. Our workspace is on Azure and is managed with Terraform.
Stack trace:
File /local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.10/site-packages/watermark.py:137, in Watermark.write(self)
135 if not os.path.exists(self.path_base):
136 self.logger.debug(f"Creating directory: {self.path_base}")
--> 137 os.makedirs(self.path_base, exist_ok=True)
139 while current_retry < self.__max_retries and not success:
140 try:
File /usr/lib/python3.10/os.py:215, in makedirs(name, mode, exist_ok)
213 if head and tail and not path.exists(head):
214 try:
--> 215 makedirs(head, exist_ok=exist_ok)
216 except FileExistsError:
217 # Defeats race condition when another thread created the path
218 pass
File /usr/lib/python3.10/os.py:215, in makedirs(name, mode, exist_ok)
213 if head and tail and not path.exists(head):
214 try:
--> 215 makedirs(head, exist_ok=exist_ok)
216 except FileExistsError:
217 # Defeats race condition when another thread created the path
218 pass
[... skipping similar frames: makedirs at line 215 (2 times)]
File /usr/lib/python3.10/os.py:215, in makedirs(name, mode, exist_ok)
213 if head and tail and not path.exists(head):
214 try:
--> 215 makedirs(head, exist_ok=exist_ok)
216 except FileExistsError:
217 # Defeats race condition when another thread created the path
218 pass
File /usr/lib/python3.10/os.py:225, in makedirs(name, mode, exist_ok)
223 return
224 try:
--> 225 mkdir(name, mode)
226 except OSError:
227 # Cannot rely on checking for EEXIST, since the operating system
228 # could give priority to other errors like EACCES or EROFS
229 if not exist_ok or not path.isdir(name):
PermissionError: [Errno 1] Operation not permitted: '/Volumes/mycatalog'