Can someone explain why this below code is throwing an error? My intuition is telling me it's my spark version (3.2.1) but would like confirmation:
d = {'key':['a','a','c','d','e','f','g','h'],
'data':[1,2,3,4,5,6,7,8]}
x = ps.DataFrame(d)
x[x['key'].duplicated()]
-----------------------------------------------------------------------------
PandasNotImplementedError Traceback (most recent call last)
<command-6518367> in <module>
3
4 x = ps.DataFrame(d)
----> 5 x[x['key'].duplicated()]
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
192 start = time.perf_counter()
193 try:
--> 194 res = func(*args, **kwargs)
195 logger.log_success(
196 class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/pandas/series.py in __getattr__(self, item)
6276 property_or_func = getattr(MissingPandasLikeSeries, item)
6277 if isinstance(property_or_func, property):
-> 6278 return property_or_func.fget(self) # type: ignore
6279 else:
6280 return partial(property_or_func, self)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(self)
261 def wrapper(self):
262 try:
--> 263 return prop.fget(self)
264 finally:
265 logger.log_missing(class_name, property_name, is_deprecated)
/databricks/spark/python/pyspark/pandas/missing/__init__.py in unsupported_property(self)
36 @property
37 def unsupported_property(self):
---> 38 raise PandasNotImplementedError(
39 class_name=class_name, property_name=property_name, reason=reason
40 )
PandasNotImplementedError: The property `pd.Series.duplicated()` is not implemented. 'duplicated' API returns np.ndarray and the data size is too large.You can just use DataFrame.deduplicated instead