<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: How to remove extra ENTER line in csv UTF-16 while reading in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12996#M7738</link>
    <description>&lt;P&gt;This is working fine,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="image"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/874i22CDA01217600134/image-size/large?v=v2&amp;amp;px=999" role="button" title="image" alt="image" /&gt;&lt;/span&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql.functions import regexp_replace
&amp;nbsp;
path="dbfs:/FileStore/df/test.csv"
dff = spark.read.option("header", "true").option("inferSchema", "true").option('multiline', 'true').option('encoding', 'UTF-8').option("delimiter", "‡‡,‡‡").csv(path)
&amp;nbsp;
for i in dffs_headers:
  columnLabel = i[0]
  newColumnLabel = columnLabel.replace('‡‡','').replace('‡‡','')
  
  dff=dff.withColumn(newColumnLabel,regexp_replace(columnLabel,'^\\‡‡|\\‡‡$',''))
  
  if columnLabel != newColumnLabel:
    dff = dff.drop(columnLabel)
  dff.show(truncate=False)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Please select my answer as the best answer it will be a great help&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Aviral Bhardwaj&lt;/P&gt;</description>
    <pubDate>Mon, 09 Jan 2023 04:33:55 GMT</pubDate>
    <dc:creator>Aviral-Bhardwaj</dc:creator>
    <dc:date>2023-01-09T04:33:55Z</dc:date>
    <item>
      <title>How to remove extra ENTER line in csv UTF-16 while reading</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12995#M7737</link>
      <description>&lt;P&gt;Dear Friends,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I have a csv and it looks like this&lt;/P&gt;&lt;P&gt;‡‡Id‡‡,‡‡Version‡‡,‡‡Questionnaire‡‡,‡‡Date‡‡&lt;/P&gt;&lt;P&gt;‡‡123456‡‡,‡‡Version2‡‡,‡‡All questions have been answered accurately &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;and the guidance in the questionnaire was understood and followed‡‡,‡‡2010-12-16 00:01:48.020000000‡‡&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;B&gt;There is an extra ENTER line&lt;/B&gt; "&lt;I&gt;and the guidance in the questionnaire was understood and followed"&lt;/I&gt; this part is coming as a new line in the csv. &amp;nbsp;Source file encoding is UTF-16 LE BOM.&lt;/P&gt;&lt;P&gt;At the end of every line, I have CRLF and at the end of every ENTER extra line, I have LF&lt;/P&gt;&lt;P&gt;I should mention in my code something like&amp;nbsp;&lt;B&gt;lineSep \r\n&amp;nbsp;&lt;/B&gt;,but how?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I wrote below code to read this csv &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;dff = spark.read.option("header", "true") \&lt;/P&gt;&lt;P&gt;.option("inferSchema", "true") \&lt;/P&gt;&lt;P&gt;.option('multiline', 'true') \&lt;/P&gt;&lt;P&gt;.option('encoding', 'UTF-16') \&lt;/P&gt;&lt;P&gt;.option("delimiter", "‡‡,‡‡") \&lt;/P&gt;&lt;P&gt;.csv("/mnt/path/data.csv")&lt;/P&gt;&lt;P&gt;dffs_headers = dff.dtypes&lt;/P&gt;&lt;P&gt;for i in dffs_headers:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;columnLabel = i[0]&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;newColumnLabel = columnLabel.replace('‡‡','').replace('‡‡','')&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;dff=dff.withColumn(newColumnLabel,regexp_replace(columnLabel,'^\\‡‡|\\‡‡$',''))&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;if columnLabel != newColumnLabel:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;dff = dff.drop(columnLabel)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;display(dff)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;But, in the result is not correct as for the given Id, 
&lt;B&gt;Questionnaire column data is breaking &lt;/B&gt;after &lt;I&gt;"All questions have been answered accurately"&lt;/I&gt; and is displayed in the next row. I want the entire text between the double-dagger delimiters "‡‡,‡‡" to be read as one row, even if there is an extra ENTER line.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Please help friends @Aviral Bhardwaj​&amp;nbsp;&lt;/P&gt;&lt;P&gt;@DataBricksHelp232​&amp;nbsp;@Rahul@Databricks​&amp;nbsp;@Uma Dacharla​&amp;nbsp;@Uma Maheswara Rao Desula​&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sun, 08 Jan 2023 16:20:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12995#M7737</guid>
      <dc:creator>shamly</dc:creator>
      <dc:date>2023-01-08T16:20:09Z</dc:date>
    </item>
    <item>
      <title>Re: How to remove extra ENTER line in csv UTF-16 while reading</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12996#M7738</link>
      <description>&lt;P&gt;This is working fine,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="image"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/874i22CDA01217600134/image-size/large?v=v2&amp;amp;px=999" role="button" title="image" alt="image" /&gt;&lt;/span&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql.functions import regexp_replace
&amp;nbsp;
path="dbfs:/FileStore/df/test.csv"
dff = spark.read.option("header", "true").option("inferSchema", "true").option('multiline', 'true').option('encoding', 'UTF-8').option("delimiter", "‡‡,‡‡").csv(path)
&amp;nbsp;
for i in dffs_headers:
  columnLabel = i[0]
  newColumnLabel = columnLabel.replace('‡‡','').replace('‡‡','')
  
  dff=dff.withColumn(newColumnLabel,regexp_replace(columnLabel,'^\\‡‡|\\‡‡$',''))
  
  if columnLabel != newColumnLabel:
    dff = dff.drop(columnLabel)
  dff.show(truncate=False)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Please select my answer as the best answer it will be a great help&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Aviral Bhardwaj&lt;/P&gt;</description>
      <pubDate>Mon, 09 Jan 2023 04:33:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12996#M7738</guid>
      <dc:creator>Aviral-Bhardwaj</dc:creator>
      <dc:date>2023-01-09T04:33:55Z</dc:date>
    </item>
    <item>
      <title>Re: How to remove extra ENTER line in csv UTF-16 while reading</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12997#M7739</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;This is not working for me as the source file encoding is UTF-16 LE BOM.&lt;/P&gt;&lt;P&gt;At the end of every line, I have CRLF and at the end of every ENTER extra line, I have LF&lt;/P&gt;&lt;P&gt;I should mention in my code something like &lt;B&gt;lineSep \r\n&lt;/B&gt;, but how?&lt;/P&gt;</description>
      <pubDate>Mon, 09 Jan 2023 07:57:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12997#M7739</guid>
      <dc:creator>shamly</dc:creator>
      <dc:date>2023-01-09T07:57:16Z</dc:date>
    </item>
    <item>
      <title>Re: How to remove extra ENTER line in csv UTF-16 while reading</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12998#M7740</link>
      <description>&lt;P&gt;Connect with me here - &lt;A href="https://www.linkedin.com/in/aviralb/" target="test_blank"&gt;https://www.linkedin.com/in/aviralb/&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;We will try to solve it in a live call&lt;/P&gt;</description>
      <pubDate>Mon, 09 Jan 2023 08:12:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-remove-extra-enter-line-in-csv-utf-16-while-reading/m-p/12998#M7740</guid>
      <dc:creator>Aviral-Bhardwaj</dc:creator>
      <dc:date>2023-01-09T08:12:06Z</dc:date>
    </item>
  </channel>
</rss>

