I am currently trying to unzip files recursively from one folder(source folder) and copy all the unzipped files into the destination folder using databricks(pyspark). The destination path is still empty even after running this code. I tried looking for solutions online, but most of them are for a single zip file only.
import os
import zipfile
from pyspark.dbutils import DBUtils
local_file_path = dbutils.fs.ls('abfss://storage@container.dfs.core.windows.net/zipped_folder')
dest_file_path = dbutils.fs.ls('abfss://storage@container.dfs.core.windows.net/unzipped_folder')
def unzip_file(file_path, extract_path):
# Copy the file to local file system
local_path = "/tmp/" + os.path.basename(file_path)
dbutils.fs.cp(file_path, "file:" + local_path, recurse=True)
# Unzip the file on local file system
with zipfile.ZipFile(local_path, 'r') as zip_ref:
zip_ref.extractall("/tmp")
# Create the destination folder if it doesn't exist
dbutils.fs.mkdirs(extract_path)
# Copy the unzipped files to the destination folder
for root, dirs, files in os.walk("/tmp"):
for filename in files:
if filename != os.path.basename(file_path):
local_file = os.path.join(root, filename)
dest_file = extract_path + "/" + os.path.relpath(local_file, "/tmp")
try:
dbutils.fs.cp("file:" + local_file, dest_file, recurse=True) # Remove 'overwrite=True'
except Exception as e:
print(f"Error copying file: {e}")
# Remove the temporary files
dbutils.fs.rm(local_path, recurse=True)
# Call the unzip_file function with the appropriate arguments
if dest_file_path:
unzip_file(str(local_file_path[0].path), str(dest_file_path[0].path))