<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Pyspark CSV Incorrect Count in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30690#M22274</link>
    <description>&lt;P&gt;Hi @Kaniz Fatma​&amp;nbsp;Unfortunately, the suggestion hasn't helped and I've not been able to figure out the reason for the strange results so far.&lt;/P&gt;</description>
    <pubDate>Tue, 04 Oct 2022 08:00:00 GMT</pubDate>
    <dc:creator>Tarique</dc:creator>
    <dc:date>2022-10-04T08:00:00Z</dc:date>
    <item>
      <title>Pyspark CSV Incorrect Count</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30686#M22270</link>
      <description>&lt;PRE&gt;&lt;CODE&gt;B1123451020-502,"","{""m"": {""difference"": 60}}","","","",2022-02-12T15:40:00.783Z
B1456741975-266,"","{""m"": {""difference"": 60}}","","","",2022-02-04T17:03:59.566Z
B1789753479-460,"","",",","","",2022-02-18T14:46:57.332Z
B1456741977-123,"","{""m"": {""difference"": 60}}","","","",2022-02-04T17:03:59.566Z&lt;/CODE&gt;&lt;/PRE&gt;&lt;PRE&gt;&lt;CODE&gt;df_inputfile = (spark.read.format("com.databricks.spark.csv")
                                     .option("inferSchema", "true")
                                     .option("header","false")                
                                     .option("quotedstring",'\"')
                                     .option("escape",'\"')
                                     .option("multiline","true")
                                     .option("delimiter",",")
                                     .load('&amp;lt;path to csv&amp;gt;'))
&amp;nbsp;
print(df_inputfile.count()) # Prints 3
print(df_inputfile.distinct().count()) # Prints 4&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;I'm trying to read the data above from a CSV file and end up with a wrong count, although the dataframe contains all the expected records.&amp;nbsp;df_inputfile.count()&amp;nbsp;prints 3 although it should have been 4. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;It looks like this is happening because of the single comma in the 4th column of the 3rd row. Can someone please explain why?&lt;/P&gt;</description>
      <pubDate>Tue, 27 Sep 2022 13:58:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30686#M22270</guid>
      <dc:creator>Tarique</dc:creator>
      <dc:date>2022-09-27T13:58:01Z</dc:date>
    </item>
    <item>
      <title>Re: Pyspark CSV Incorrect Count</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30689#M22273</link>
      <description>&lt;P&gt;Hi Debayan, there's no syntax error in the code snippet. Using .option("escape",'"') makes no difference to the counts. I still get wrong counts.&lt;/P&gt;</description>
      <pubDate>Tue, 04 Oct 2022 07:57:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30689#M22273</guid>
      <dc:creator>Tarique</dc:creator>
      <dc:date>2022-10-04T07:57:20Z</dc:date>
    </item>
    <item>
      <title>Re: Pyspark CSV Incorrect Count</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30690#M22274</link>
      <description>&lt;P&gt;Hi @Kaniz Fatma​&amp;nbsp;Unfortunately, the suggestion hasn't helped and I've not been able to figure out the reason for the strange results so far.&lt;/P&gt;</description>
      <pubDate>Tue, 04 Oct 2022 08:00:00 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30690#M22274</guid>
      <dc:creator>Tarique</dc:creator>
      <dc:date>2022-10-04T08:00:00Z</dc:date>
    </item>
    <item>
      <title>Re: Pyspark CSV Incorrect Count</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30691#M22275</link>
      <description>&lt;P&gt;Hi @Tarique Anwer​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Hope all is well! &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Just wanted to check in to see whether you were able to resolve your issue. If so, would you be happy to share the solution or &lt;B&gt;mark an answer as best&lt;/B&gt;? Otherwise, please let us know if you need more help.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;We'd love to hear from you.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 21 Nov 2022 03:43:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30691#M22275</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2022-11-21T03:43:24Z</dc:date>
    </item>
    <item>
      <title>Re: Pyspark CSV Incorrect Count</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30687#M22271</link>
      <description>&lt;P&gt;Hi, Could you please check the syntax? '\"' ?&lt;/P&gt;</description>
      <pubDate>Fri, 30 Sep 2022 06:23:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-csv-incorrect-count/m-p/30687#M22271</guid>
      <dc:creator>Debayan</dc:creator>
      <dc:date>2022-09-30T06:23:05Z</dc:date>
    </item>
  </channel>
</rss>

