Options
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
07-23-2024 02:19 AM
Try using this code.
import pyspark
from pyspark.sql import SparkSession

# Read an Oracle table into Databricks in bounded batches and append each
# batch to ADLS as Parquet. Batching keeps every JDBC read small so one
# huge fetch cannot exhaust driver/executor memory.

# Initialize Spark session (on Databricks this returns the existing session).
spark = SparkSession.builder.appName("OracleToDatabricks").getOrCreate()

# Oracle connection settings -- fill in the placeholders.
conn = "jdbc:oracle:thin:@//<host>:<port>/<service_name>"
user = "<username>"
pwd = "<password>"
# Current driver class name; "oracle.jdbc.driver.OracleDriver" is the
# deprecated legacy name.
driver = "oracle.jdbc.OracleDriver"
pQuery = "<table_name>"
lbound = 1          # smallest PARTITION_KEY value to read (inclusive)
ubound = 1000000    # largest PARTITION_KEY value to read (inclusive)
batch_size = 10000  # width of the PARTITION_KEY range per JDBC read

# JDBC connection properties. Every value must be a string.
# NOTE: partitionColumn / lowerBound / upperBound / numPartitions are
# deliberately NOT set here -- they would re-partition EACH per-batch
# subquery over the full lbound..ubound range, producing mostly-empty
# partitions. The loop below already does the range slicing.
properties = {
    "user": user,
    "password": pwd,
    "driver": driver,
    "autoReconnect": "true",
    "fetchsize": "10000",  # rows per round trip; Oracle's default (10) is tiny
}

# ubound + 1 so the final batch is included when the range divides evenly
# into batch_size (range() excludes its stop value).
for i in range(lbound, ubound + 1, batch_size):
    lower_bound = i
    upper_bound = min(i + batch_size - 1, ubound)
    # Oracle does not accept the AS keyword before a table/subquery alias,
    # so the alias follows the closing parenthesis directly.
    query = (
        f"(SELECT * FROM {pQuery} "
        f"WHERE PARTITION_KEY >= {lower_bound} "
        f"AND PARTITION_KEY <= {upper_bound}) TEMP"
    )
    df = spark.read.jdbc(url=conn, table=query, properties=properties)
    # Process and write this batch to ADLS as Parquet.
    df.write.mode("append").parquet("path/to/adls")
Rishabh Pandey