<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Extracting Topics From Text Data Using PySpark in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/93363#M3721</link>
    <description>&lt;P&gt;Thanks&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/117376"&gt;@filipniziol&lt;/a&gt;&amp;nbsp; for the quick response. Legend.&amp;nbsp;&lt;BR /&gt;That's right, it needed to be converted to a sparse vector.&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 10 Oct 2024 00:44:42 GMT</pubDate>
    <dc:creator>amirA</dc:creator>
    <dc:date>2024-10-10T00:44:42Z</dc:date>
    <item>
      <title>Extracting Topics From Text Data Using PySpark</title>
      <link>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/93209#M3718</link>
      <description>&lt;P&gt;Hi Everyone&lt;BR /&gt;I tried to follow the same steps in &lt;A href="https://www.databricks.com/blog/2021/07/29/an-experimentation-pipeline-for-extracting-topics-from-text-data-using-pyspark.html" target="_self"&gt;Topic from Text&lt;/A&gt;&amp;nbsp;on data similar to the example. However, when I try to fit the model with data I get this error.&lt;BR /&gt;&lt;SPAN class=""&gt;IllegalArgumentException: &lt;/SPAN&gt;&lt;SPAN&gt;requirement failed: Column features must be of type equal to one of the following types: [struct&amp;lt;type:tinyint,size:int,indices:array&amp;lt;int&amp;gt;,values:array&amp;lt;double&amp;gt;&amp;gt;, array&amp;lt;double&amp;gt;, array&amp;lt;float&amp;gt;] but was actually of type struct&amp;lt;type:tinyint,size:int,indices:array&amp;lt;int&amp;gt;,values:array&amp;lt;double&amp;gt;&amp;gt;.&lt;/SPAN&gt;&lt;BR /&gt;My data:&lt;BR /&gt;Col =&amp;gt; { "&lt;SPAN&gt;vec_json&lt;/SPAN&gt;", "&lt;SPAN&gt;features&lt;/SPAN&gt;" }&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Row(vec_json='[{"type":0,"size":4927,"indices":[0,8,18,30,145,336,786,1231,1695,3653],"values":[2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0]}]', features=Row(type=0, size=4927, indices=[0, 8, 18, 30, 145, 336, 786, 1231, 1695, 3653], values=[2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0]))&lt;/SPAN&gt;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;lda_model &lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt; &lt;SPAN&gt;LDA&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;k&lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt;&lt;SPAN&gt;20&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;maxIter&lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt;&lt;SPAN&gt;20&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;model &lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt;&lt;SPAN&gt; lda_model.&lt;/SPAN&gt;&lt;SPAN&gt;fit&lt;/SPAN&gt;&lt;SPAN&gt;(df_new)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Many Thanks in 
advance&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Regards&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Amir&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Wed, 09 Oct 2024 01:56:36 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/93209#M3718</guid>
      <dc:creator>amirA</dc:creator>
      <dc:date>2024-10-09T01:56:36Z</dc:date>
    </item>
    <item>
      <title>Re: Extracting Topics From Text Data Using PySpark</title>
      <link>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/93230#M3719</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/125493"&gt;@amirA&lt;/a&gt;&amp;nbsp;,&lt;BR /&gt;&lt;BR /&gt;The LDA model expects the features column to be of type Vector from the pyspark.ml.linalg module, specifically either a SparseVector or DenseVector, whereas you have provided Row type.&lt;BR /&gt;You need to convert your Row object to SparseVector.&lt;BR /&gt;&lt;BR /&gt;Check this out:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Import required libraries
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType, StructField, IntegerType, ArrayType, DoubleType, StringType
from pyspark.ml.linalg import SparseVector, Vectors, VectorUDT
from pyspark.ml.clustering import LDA

# Define schema for the input DataFrame
# "features" is declared as a plain struct (type, size, indices, values),
# not as a Vector column, which is why it is converted further below.
schema = StructType([
    StructField("vec_json", StringType(), True),
    StructField("features", StructType([
        StructField("type", IntegerType(), True),
        StructField("size", IntegerType(), True),
        StructField("indices", ArrayType(IntegerType()), True),
        StructField("values", ArrayType(DoubleType()), True)
    ]), True)
])

# Sample data
# One row: the JSON string in "vec_json" and the tuple for "features"
# carry the same type/size/indices/values.
data = [
    ("[{\"type\":0,\"size\":4927,\"indices\":[0,8,18,30,145,336,786,1231,1695,3653],\"values\":[2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0]}]",
     (0, 4927, [0, 8, 18, 30, 145, 336, 786, 1231, 1695, 3653], [2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0]))
]

# Create DataFrame
# NOTE(review): "spark" is not defined in this snippet - presumably the
# SparkSession provided by the notebook environment; confirm before running elsewhere.
df = spark.createDataFrame(data, schema)

# Define a UDF to convert struct to SparseVector
def convert_to_sparse_vector(features):
    """Convert a (type, size, indices, values) struct into an ML vector.

    Args:
        features: Row/struct with the fields ``type``, ``size``, ``indices``
            and ``values``, or None for a null cell.

    Returns:
        None for a null input, a dense vector when ``type`` is 1, otherwise
        a sparse vector built from ``size``, ``indices`` and ``values``.
    """
    # The schema marks "features" as nullable; pass nulls through instead of
    # failing on None['size'].
    if features is None:
        return None
    # In Spark's vector struct encoding, type 1 means dense: size and indices
    # are null and "values" holds every entry, so Vectors.sparse cannot be used.
    if features['type'] == 1:
        return Vectors.dense(features['values'])
    return Vectors.sparse(features['size'], features['indices'], features['values'])

# Register the UDF with the correct return type
# (VectorUDT, so the UDF output is a vector column rather than a struct)
convert_to_sparse_vector_udf = udf(convert_to_sparse_vector, VectorUDT())

# Convert the features column to SparseVector type
# withColumn reuses the name "features", so the struct column is replaced in df_new.
df_new = df.withColumn("features", convert_to_sparse_vector_udf(col("features")))

# Verify the schema to confirm that "features" is now a SparseVector
df_new.printSchema()

# Show the DataFrame to see the changes
df_new.show(truncate=False)

# Fit the LDA model
# Same k=20 / maxIter=20 settings as in the original question; no input column
# is passed, so LDA uses its default features column.
lda_model = LDA(k=20, maxIter=20)
model = lda_model.fit(df_new)

# Print the topics
topics = model.describeTopics()
topics.show(truncate=False)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 09 Oct 2024 06:33:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/93230#M3719</guid>
      <dc:creator>filipniziol</dc:creator>
      <dc:date>2024-10-09T06:33:15Z</dc:date>
    </item>
    <item>
      <title>Re: Extracting Topics From Text Data Using PySpark</title>
      <link>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/93363#M3721</link>
      <description>&lt;P&gt;Thanks&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/117376"&gt;@filipniziol&lt;/a&gt;&amp;nbsp; for the quick response. Legend.&amp;nbsp;&lt;BR /&gt;That's right, it needed to be converted to a sparse vector.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 10 Oct 2024 00:44:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/93363#M3721</guid>
      <dc:creator>amirA</dc:creator>
      <dc:date>2024-10-10T00:44:42Z</dc:date>
    </item>
    <item>
      <title>Re: Extracting Topics From Text Data Using PySpark</title>
      <link>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/98749#M3783</link>
      <description>&lt;P&gt;Thank you so much for the solution.&lt;/P&gt;</description>
      <pubDate>Thu, 14 Nov 2024 07:47:39 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/extracting-topics-from-text-data-using-pyspark/m-p/98749#M3783</guid>
      <dc:creator>EmmaBlake</dc:creator>
      <dc:date>2024-11-14T07:47:39Z</dc:date>
    </item>
  </channel>
</rss>

