File size: 1,298 Bytes
46290fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

from unstructured.partition.html import partition_html
# Example input URL for scrappost():
# source = 'https://www.linkedin.com/posts/jobanpreet-singh-392581207_asr-whisper-speechrecognition-activity-7172803455718158336-MC-j?utm_source=share&utm_medium=member_desktop'


def scrappost(url):
    """Return the text of the first content group found on the page at *url*.

    The page is parsed with ``partition_html`` and its elements are walked in
    document order.  Text from narrative-text and list-item elements is
    concatenated (with no separator) until a title element is reached after
    some text has already been collected; that title ends the first group.
    Title elements seen before any text has been collected are skipped, so
    title text is never part of the returned string.

    Args:
        url: Address of the page, passed through as ``partition_html(url=url)``.

    Returns:
        The concatenated text of the first group, or ``None`` when the page
        yields no narrative or list-item text at all.
    """
    # NOTE(review): elements are recognised by matching their class path as a
    # string, which ties this function to the unstructured release that
    # defines these HTML element classes -- confirm against the installed
    # version.
    title_marker = 'unstructured.documents.html.HTMLTitle'
    body_markers = (
        'unstructured.documents.html.HTMLNarrativeText',
        'unstructured.documents.html.HTMLListItem',
    )

    # ingest and preprocess webpage into Unstructured elements object
    elements = partition_html(url=url)

    parts = []
    for element in elements:
        type_name = str(type(element))

        if title_marker in type_name:
            if parts:
                # A title after collected text closes the first group; only
                # that group is returned, so the rest of the page is ignored.
                break
            # Leading titles (nothing collected yet) contribute no text.
            continue

        if any(marker in type_name for marker in body_markers):
            # Skip empty text so `parts` is non-empty only when there is
            # real content to return.
            if element.text:
                parts.append(element.text)

    if not parts:
        return None
    return "".join(parts)