File size: 1,321 Bytes
360f505
 
e0169c8
 
 
 
 
 
360f505
e0169c8
 
 
 
 
 
 
 
 
 
 
 
 
360f505
e0169c8
 
 
 
 
 
 
 
 
 
360f505
e0169c8
 
360f505
 
e0169c8
 
 
 
360f505
e0169c8
 
 
 
360f505
e0169c8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import shutil

from bs4 import BeautifulSoup
from markdown import markdown
import os
import re
from pathlib import Path

from settings import *


def markdown_to_text(markdown_string):
    """Convert a markdown string to plaintext.

    The markdown is rendered to HTML, the text nodes are extracted with
    BeautifulSoup, and a few regex passes then strip leftovers (code-fence
    markers, ellipses, surplus blank lines) from the extracted text.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The plaintext, with every run of two or more newlines collapsed to
        exactly two.
    """

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove HTML comments before text extraction. The match is non-greedy
    # so each comment is removed on its own; a greedy match would run from
    # the first "<!--" to the last "-->" and delete everything in between.
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    # Drop a "bash" language tag that immediately follows an opening <code>.
    html = re.sub(r'<code>bash', '<code>', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.find_all(string=True))

    # Strip code-fence openers that carry a language tag. "python" must be
    # tried before "py": regex alternation is ordered, so with "py" first a
    # "```python" fence would leave a stray "thon" behind.
    text = re.sub(r'```(python|py|diff)', '', text)
    # Strip bare fence markers that end a line, keeping the newline.
    text = re.sub(r'```\n', '\n', text)
    # Remove everything from a dash followed by a run of spaces to the end
    # of the line (presumably removed-line markers from diff snippets --
    # TODO confirm against the scraped docs).
    text = re.sub(r'-         .*', '', text)
    text = text.replace('...', '')
    # Collapse runs of blank lines into a single blank line.
    text = re.sub(r'\n{2,}', '\n\n', text)

    return text


# Script body: run every file found under MARKDOWN_DIR_TO_SCRAPE through
# markdown_to_text and write the results as flat .txt files into a freshly
# created TEXT_CHUNKS_DIR.
dir_to_scrape = Path(MARKDOWN_DIR_TO_SCRAPE)
files = list(dir_to_scrape.rglob("*"))

# Start from an empty output directory. rmtree errors are ignored, so the
# directory may still exist afterwards; exist_ok keeps makedirs from raising
# FileExistsError in that case.
shutil.rmtree(TEXT_CHUNKS_DIR, ignore_errors=True)
os.makedirs(TEXT_CHUNKS_DIR, exist_ok=True)

for file in files:
    # rglob("*") also yields directories; only regular files are converted.
    if not file.is_file():
        continue

    # Files directly in the scraped root get an empty prefix. Full paths are
    # compared (not just directory names) so that a nested directory which
    # happens to share the root's name still contributes its own prefix.
    parent = file.parent.stem if file.parent != dir_to_scrape else ""

    md = file.read_text(encoding='utf-8')
    text = markdown_to_text(md)

    # Output name is "<parent dir>_<file stem>.txt" inside TEXT_CHUNKS_DIR.
    out_path = os.path.join(TEXT_CHUNKS_DIR, f"{parent}_{file.stem}.txt")
    with open(out_path, "w", encoding='utf-8') as f:
        f.write(text)