<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Issues loading files csv files that contain BOM (Byte Order Mark) character in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/issues-loading-files-csv-files-that-contain-bom-byte-order-mark/m-p/2719#M41</link>
    <description>&lt;P&gt;I keep getting an error when creating a dataframe or stream from certain CSV files where the header contains a BOM (Byte Order Mark) character&amp;nbsp; &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;This is the error message:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;AnalysisException: [RequestId=e09c7c8d-2399-4d6a-84ae-216e6a9f8f6e ErrorClass=INVALID_PARAMETER_VALUE.INVALID_FIELD_LENGTH] CreateTable column_1.name too long. Maximum length is 255 characters.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Below is the full error message:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;File /databricks/spark/python/pyspark/sql/streaming/readwriter.py:1481, in DataStreamWriter.toTable(self, tableName, format, outputMode, partitionBy, queryName, **options)
   1479 if queryName is not None:
   1480     self.queryName(queryName)
-&amp;gt; 1481 return self._sq(self._jwrite.toTable(tableName))
&amp;nbsp;
File /databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
   1316 command = proto.CALL_COMMAND_NAME +\
   1317     self.command_header +\
   1318     args_command +\
   1319     proto.END_COMMAND_PART
   1321 answer = self.gateway_client.send_command(command)
-&amp;gt; 1322 return_value = get_return_value(
   1323     answer, self.gateway_client, self.target_id, self.name)
   1325 for temp_arg in temp_args:
   1326     if hasattr(temp_arg, "_detach"):&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;And below is the code that normally works if the file's header does not have a BOM. Notice I tried adding the encoding option but that doesn't work.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql.functions import input_file_name, current_timestamp, lit
&amp;nbsp;
def startRawSteam(schema):
&amp;nbsp;
    # Configure Auto Loader 
    streaming_query = (spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", raw_checkpoint_path)
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("lineSep", "\r\n")  # Specify the Windows-style EOL character (CRLF)
    .option("header", "false")
    .option("encoding", "UTF-8-sig")
    .option("pathGlobfilter", file_pattern)
    .load(f"{external_location}")
    .select("*", lit("unspecified").alias("hospital"), input_file_name().alias("source_file"), current_timestamp().alias("processing_time"))
    .writeStream
    .option("checkpointLocation", raw_checkpoint_path)
    .trigger(availableNow=True)
    .toTable(raw_table_name))
&amp;nbsp;
    return streaming_query&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Let me know if you have a solution for this.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I think I need to modify the files, but I don't know how to do that in Databricks. I don't want to download the files to my local machine since they are large and I'm not allowed to. The files are in an S3 bucket and I would like to fix them there or in Databricks using dbutils.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Any help is appreciated.&lt;/P&gt;</description>
    <pubDate>Thu, 22 Jun 2023 09:53:40 GMT</pubDate>
    <dc:creator>harraz</dc:creator>
    <dc:date>2023-06-22T09:53:40Z</dc:date>
    <item>
      <title>Issues loading files csv files that contain BOM (Byte Order Mark) character</title>
      <link>https://community.databricks.com/t5/data-engineering/issues-loading-files-csv-files-that-contain-bom-byte-order-mark/m-p/2719#M41</link>
      <description>&lt;P&gt;I keep getting an error when creating a dataframe or stream from certain CSV files where the header contains a BOM (Byte Order Mark) character&amp;nbsp; &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;This is the error message:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;AnalysisException: [RequestId=e09c7c8d-2399-4d6a-84ae-216e6a9f8f6e ErrorClass=INVALID_PARAMETER_VALUE.INVALID_FIELD_LENGTH] CreateTable column_1.name too long. Maximum length is 255 characters.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Below is the full error message:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;File /databricks/spark/python/pyspark/sql/streaming/readwriter.py:1481, in DataStreamWriter.toTable(self, tableName, format, outputMode, partitionBy, queryName, **options)
   1479 if queryName is not None:
   1480     self.queryName(queryName)
-&amp;gt; 1481 return self._sq(self._jwrite.toTable(tableName))
&amp;nbsp;
File /databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
   1316 command = proto.CALL_COMMAND_NAME +\
   1317     self.command_header +\
   1318     args_command +\
   1319     proto.END_COMMAND_PART
   1321 answer = self.gateway_client.send_command(command)
-&amp;gt; 1322 return_value = get_return_value(
   1323     answer, self.gateway_client, self.target_id, self.name)
   1325 for temp_arg in temp_args:
   1326     if hasattr(temp_arg, "_detach"):&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;And below is the code that normally works if the file's header does not have a BOM. Notice I tried adding the encoding option but that doesn't work.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql.functions import input_file_name, current_timestamp, lit
&amp;nbsp;
def startRawSteam(schema):
&amp;nbsp;
    # Configure Auto Loader 
    streaming_query = (spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", raw_checkpoint_path)
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("lineSep", "\r\n")  # Specify the Windows-style EOL character (CRLF)
    .option("header", "false")
    .option("encoding", "UTF-8-sig")
    .option("pathGlobfilter", file_pattern)
    .load(f"{external_location}")
    .select("*", lit("unspecified").alias("hospital"), input_file_name().alias("source_file"), current_timestamp().alias("processing_time"))
    .writeStream
    .option("checkpointLocation", raw_checkpoint_path)
    .trigger(availableNow=True)
    .toTable(raw_table_name))
&amp;nbsp;
    return streaming_query&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Let me know if you have a solution for this.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I think I need to modify the files, but I don't know how to do that in Databricks. I don't want to download the files to my local machine since they are large and I'm not allowed to. The files are in an S3 bucket and I would like to fix them there or in Databricks using dbutils.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Any help is appreciated.&lt;/P&gt;</description>
      <pubDate>Thu, 22 Jun 2023 09:53:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issues-loading-files-csv-files-that-contain-bom-byte-order-mark/m-p/2719#M41</guid>
      <dc:creator>harraz</dc:creator>
      <dc:date>2023-06-22T09:53:40Z</dc:date>
    </item>
    <item>
      <title>Re: Issues loading files csv files that contain BOM (Byte Order Mark) character</title>
      <link>https://community.databricks.com/t5/data-engineering/issues-loading-files-csv-files-that-contain-bom-byte-order-mark/m-p/2720#M42</link>
      <description>&lt;P&gt;Hi @mohamed harraz​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Great to meet you, and thanks for your question!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt; Let's see if your peers in the community have an answer to your question. Thanks.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 23 Jun 2023 05:19:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/issues-loading-files-csv-files-that-contain-bom-byte-order-mark/m-p/2720#M42</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-06-23T05:19:15Z</dc:date>
    </item>
  </channel>
</rss>

