File <command-3201934447568746>, line 2
      1 with open(f"{articles_path.replace('dbfs:', '/dbfs/')}2302.06476.pdf", mode="rb") as pdf:
----> 2   doc = extract_doc_text(pdf.read())
      3   print(doc)
File <command-3201934447568602>, line 8, in extract_doc_text(x)
      6 def extract_doc_text(x : bytes) -> str:
      7   # Read files and extract the values with unstructured
----> 8   sections = partition(file=io.BytesIO(x))
      9   def clean_section(txt):
     10     txt = re.sub(r'\n', '', txt)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/auto.py:383, in partition(filename, content_type, file, file_filename, url, include_page_breaks, strategy, encoding, paragraph_grouper, headers, skip_infer_table_types, ssl_verify, ocr_languages, languages, detect_language_per_element, pdf_infer_table_structure, pdf_extract_images, pdf_image_output_dir_path, xml_keep_tags, data_source_metadata, metadata_filename, request_timeout, **kwargs) 381 elif filetype == FileType.PDF: 382 _partition_pdf = _get_partition_with_extras("pdf") --> 383 elements = _partition_pdf( 384 filename=filename, # type: ignore 385 file=file, # type: ignore 386 url=None, 387 include_page_breaks=include_page_breaks, 388 infer_table_structure=infer_table_structure, 389 strategy=strategy, 390 languages=languages, 391 extract_images_in_pdf=pdf_extract_images, 392 image_output_dir_path=pdf_image_output_dir_path, 393 **kwargs, 394 ) 395 elif (filetype == FileType.PNG) or (filetype == FileType.JPG) or (filetype == FileType.TIFF): 396 elements = partition_image( 397 filename=filename, # type: ignore 398 file=file, # type: ignore (...) 404 **kwargs, 405 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/documents/elements.py:371, in process_metadata.<locals>.decorator.<locals>.wrapper(*args, **kwargs) 369 @functools.wraps(func) 370 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: --> 371 elements = func(*args, **kwargs) 372 sig = inspect.signature(func) 373 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:591, in add_filetype.<locals>.decorator.<locals>.wrapper(*args, **kwargs) 589 @functools.wraps(func) 590 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: --> 591 elements = func(*args, **kwargs) 592 sig = inspect.signature(func) 593 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/file_utils/filetype.py:546, in add_metadata.<locals>.wrapper(*args, **kwargs) 544 @functools.wraps(func) 545 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: --> 546 elements = func(*args, **kwargs) 547 sig = inspect.signature(func) 548 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/chunking/title.py:297, in add_chunking_strategy.<locals>.decorator.<locals>.wrapper(*args, **kwargs) 295 @functools.wraps(func) 296 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: --> 297 elements = func(*args, **kwargs) 298 sig = inspect.signature(func) 299 params: Dict[str, Any] = dict(**dict(zip(sig.parameters, args)), **kwargs)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/pdf.py:183, in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, include_metadata, metadata_filename, metadata_last_modified, chunking_strategy, links, extract_images_in_pdf, image_output_dir_path, **kwargs) 177 languages = convert_old_ocr_languages_to_languages(ocr_languages) 178 logger.warning( 179 "The ocr_languages kwarg will be deprecated in a future version of unstructured. " 180 "Please use languages instead.", 181 ) --> 183 return partition_pdf_or_image( 184 filename=filename, 185 file=file, 186 include_page_breaks=include_page_breaks, 187 strategy=strategy, 188 infer_table_structure=infer_table_structure, 189 languages=languages, 190 metadata_last_modified=metadata_last_modified, 191 extract_images_in_pdf=extract_images_in_pdf, 192 image_output_dir_path=image_output_dir_path, 193 **kwargs, 194 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/pdf.py:288, in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_last_modified, extract_images_in_pdf, image_output_dir_path, **kwargs) 271 last_modification_date = get_the_last_modification_date_pdf_or_img( 272 file=file, 273 filename=filename, 274 ) 276 if ( 277 not is_image 278 and determine_pdf_or_image_strategy( (...) 286 != "ocr_only" 287 ): --> 288 extracted_elements = extractable_elements( 289 filename=filename, 290 file=spooled_to_bytes_io_if_needed(file), 291 include_page_breaks=include_page_breaks, 292 metadata_last_modified=metadata_last_modified or last_modification_date, 293 **kwargs, 294 ) 295 pdf_text_extractable = any( 296 isinstance(el, Text) and el.text.strip() for el in extracted_elements 297 ) 298 else:
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/pdf.py:206, in extractable_elements(filename, file, include_page_breaks, metadata_last_modified, **kwargs) 204 if isinstance(file, bytes): 205 file = io.BytesIO(file) --> 206 return _partition_pdf_with_pdfminer( 207 filename=filename, 208 file=file, 209 include_page_breaks=include_page_breaks, 210 metadata_last_modified=metadata_last_modified, 211 **kwargs, 212 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/utils.py:179, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs) 170 if len(missing_deps) > 0: 171 raise ImportError( 172 f"Following dependencies are missing: {', '.join(missing_deps)}. " 173 + ( (...) 177 ), 178 ) --> 179 return func(*args, **kwargs)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/pdf.py:525, in _partition_pdf_with_pdfminer(filename, file, include_page_breaks, metadata_last_modified, **kwargs) 523 elif file: 524 fp = cast(BinaryIO, file) --> 525 elements = _process_pdfminer_pages( 526 fp=fp, 527 filename=filename, 528 include_page_breaks=include_page_breaks, 529 metadata_last_modified=metadata_last_modified, 530 **kwargs, 531 ) 533 return elements
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/pdf.py:613, in _process_pdfminer_pages(fp, filename, include_page_breaks, metadata_last_modified, sort_mode, **kwargs) 611 if _text.strip(): 612 points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) --> 613 element = element_from_text( 614 _text, 615 coordinates=points, 616 coordinate_system=coordinate_system, 617 ) 618 coordinates_metadata = CoordinatesMetadata( 619 points=points, 620 system=coordinate_system, 621 ) 623 links: List[Link] = []
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/text.py:235, in element_from_text(text, coordinates, coordinate_system) 229 elif is_possible_numbered_list(text): 230 return ListItem( 231 text=text, 232 coordinates=coordinates, 233 coordinate_system=coordinate_system, 234 ) --> 235 elif is_possible_narrative_text(text): 236 return NarrativeText( 237 text=text, 238 coordinates=coordinates, 239 coordinate_system=coordinate_system, 240 ) 241 elif is_possible_title(text):
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/text_type.py:87, in is_possible_narrative_text(text, cap_threshold, non_alpha_threshold, languages, language_checks) 84 if under_non_alpha_ratio(text, threshold=non_alpha_threshold): 85 return False ---> 87 if "eng" in languages and (sentence_count(text, 3) < 2) and (not contains_verb(text)): 88 trace_logger.detail(f"Not narrative. Text does not contain a verb:\n\n{text}") # type: ignore # noqa: E501 89 return False
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/partition/text_type.py:189, in contains_verb(text) 186 if text.isupper(): 187 text = text.lower() --> 189 pos_tags = pos_tag(text) 190 return any(tag in POS_VERB_TAGS for _, tag in pos_tags)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/unstructured/nlp/tokenize.py:55, in pos_tag(text) 53 for sentence in sentences: 54 tokens = _word_tokenize(sentence) ---> 55 parts_of_speech.extend(_pos_tag(tokens)) 56 return parts_of_speech
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/nltk/tag/__init__.py:165, in pos_tag(tokens, tagset, lang) 140 def pos_tag(tokens, tagset=None, lang="eng"): 141 """ 142 Use NLTK's currently recommended part of speech tagger to 143 tag the given list of tokens. (...) 163 :rtype: list(tuple(str, str)) 164 """ --> 165 tagger = _get_tagger(lang) 166 return _pos_tag(tokens, tagset, tagger, lang)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/nltk/tag/__init__.py:107, in _get_tagger(lang) 105 tagger = PerceptronTagger(lang=lang) 106 else: --> 107 tagger = PerceptronTagger() 108 return tagger
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/nltk/tag/perceptron.py:183, in PerceptronTagger.__init__(self, load, lang) 181 self.classes = set() 182 if load: --> 183 self.load_from_json(lang)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/nltk/tag/perceptron.py:273, in PerceptronTagger.load_from_json(self, lang) 271 def load_from_json(self, lang="eng"): 272 # Automatically find path to the tagger if location is not specified. --> 273 loc = find(f"taggers/averaged_perceptron_tagger_{lang}/") 274 with open(loc + TAGGER_JSONS[lang]["weights"]) as fin: 275 self.model.weights = json.load(fin)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-2534a48d-f9f7-4daf-934e-2d9e08931bf6/lib/python3.10/site-packages/nltk/data.py:582, in find(resource_name, paths) 580 sep = "*" * 70 581 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" --> 582 raise LookupError(resource_not_found)