<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Avoiding metadata information when sending data to GCS in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/avoiding-metadata-information-when-sending-data-to-gcs/m-p/119777#M45969</link>
    <description>&lt;P&gt;Thanks a lot&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/33816"&gt;@cgrant&lt;/a&gt;&amp;nbsp;. This removed '_started_...' and '_committed_..', but still generated the _SUCCESS file.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")&lt;/LI-CODE&gt;&lt;P&gt;This removed the _SUCCESS files as well.&lt;/P&gt;</description>
    <pubDate>Tue, 20 May 2025 14:41:52 GMT</pubDate>
    <dc:creator>aswinvishnu</dc:creator>
    <dc:date>2025-05-20T14:41:52Z</dc:date>
    <item>
      <title>Avoiding metadata information when sending data to GCS</title>
      <link>https://community.databricks.com/t5/data-engineering/avoiding-metadata-information-when-sending-data-to-gcs/m-p/119543#M45905</link>
      <description>&lt;P&gt;Hi all,&lt;BR /&gt;&lt;BR /&gt;I have a use case where I need to push the table data to a GCS bucket,&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;query = "${QUERY}"

df = spark.sql(query)

gcs_path = "${GCS_PATH}"

df.write.option("maxRecordsPerFile", int("${MAX_RECORDS_PER_FILE}")).mode("${MODE}").json(gcs_path)&lt;/LI-CODE&gt;&lt;P&gt;This can push the results of the query to GCS, but this is generating some metadata files in the location&lt;BR /&gt;'_started_...'&lt;/P&gt;&lt;P&gt;'_committed_..'&lt;/P&gt;&lt;P&gt;I want to avoid this as I can't easily do post-processing in the bucket. Any help is appreciated.&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Aswin Vishnu&lt;/P&gt;</description>
      <pubDate>Sun, 18 May 2025 03:46:45 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/avoiding-metadata-information-when-sending-data-to-gcs/m-p/119543#M45905</guid>
      <dc:creator>aswinvishnu</dc:creator>
      <dc:date>2025-05-18T03:46:45Z</dc:date>
    </item>
    <item>
      <title>Re: Avoiding metadata information when sending data to GCS</title>
      <link>https://community.databricks.com/t5/data-engineering/avoiding-metadata-information-when-sending-data-to-gcs/m-p/119686#M45943</link>
      <description>&lt;P&gt;Databricks has a special DBIO protocol that uses the _started and _committed files to transactionally write to cloud storage.&lt;/P&gt;
&lt;P&gt;You can disable this by setting the below spark config&lt;/P&gt;
&lt;PRE class="cm-s-eclipse capture-run-mode" data-reactid=".0.2.0.0.0.1:$0006b195-5e3a-4100-9c14-503c5e7d0e93.4.0.0.0"&gt;&lt;SPAN class="cm-comment"&gt;spark.conf.set("spark.sql.sources.commitProtocolClass", "org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol")&lt;/SPAN&gt;&lt;/PRE&gt;
&lt;P&gt;Also, you can read more about DBIO &lt;A href="https://www.databricks.com/blog/2017/05/31/transactional-writes-cloud-storage.html" target="_self"&gt;here&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 20 May 2025 04:59:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/avoiding-metadata-information-when-sending-data-to-gcs/m-p/119686#M45943</guid>
      <dc:creator>cgrant</dc:creator>
      <dc:date>2025-05-20T04:59:05Z</dc:date>
    </item>
    <item>
      <title>Re: Avoiding metadata information when sending data to GCS</title>
      <link>https://community.databricks.com/t5/data-engineering/avoiding-metadata-information-when-sending-data-to-gcs/m-p/119777#M45969</link>
      <description>&lt;P&gt;Thanks a lot&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/33816"&gt;@cgrant&lt;/a&gt;&amp;nbsp;. This removed '_started_...' and '_committed_..', but still generated the _SUCCESS file.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")&lt;/LI-CODE&gt;&lt;P&gt;This removed the _SUCCESS files as well.&lt;/P&gt;</description>
      <pubDate>Tue, 20 May 2025 14:41:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/avoiding-metadata-information-when-sending-data-to-gcs/m-p/119777#M45969</guid>
      <dc:creator>aswinvishnu</dc:creator>
      <dc:date>2025-05-20T14:41:52Z</dc:date>
    </item>
  </channel>
</rss>

