zinoubm committed on
Commit
db68fe3
·
1 Parent(s): fb30d5c

completing the scraping script

Browse files
Files changed (2) hide show
  1. .gitignore +3 -0
  2. document_scraping/scrape.py +21 -0
.gitignore CHANGED
@@ -127,3 +127,6 @@ dmypy.json
127
 
128
  # Pyre type checker
129
  .pyre/
 
 
 
 
127
 
128
  # Pyre type checker
129
  .pyre/
130
+
131
+ # the assets
132
+ documents/
document_scraping/scrape.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Extract the text of every PDF in ``./documents`` into one ASCII text file.

The combined text is written to ``result.txt`` inside the same directory.
"""

import os
from pathlib import Path

# Directory that holds the source PDFs and receives the combined output.
input_path = Path("./documents")

# Name of the generated text file, created inside ``input_path``.
OUTPUT_NAME = "result.txt"


def find_pdf_files(directory):
    """Return the PDF files directly inside *directory*, sorted by name.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    returned. The output file lives in the same directory as the inputs, so
    without this filter a second run would try to parse ``result.txt`` (or
    any other stray file) as a PDF.

    Args:
        directory: ``pathlib.Path`` of the folder to scan.

    Returns:
        A sorted list of ``pathlib.Path`` objects, one per PDF file.

    Raises:
        FileNotFoundError: If *directory* does not exist.
    """
    candidates = (directory / name for name in os.listdir(directory))
    # Sorting makes the order of the combined text reproducible;
    # ``os.listdir`` returns entries in arbitrary order.
    return sorted(
        path
        for path in candidates
        if path.name.lower().endswith(".pdf") and path.is_file()
    )


def strip_non_ascii(text):
    """Return *text* with every non-ASCII character removed."""
    # Encoding to ASCII with errors="ignore" drops special characters.
    return text.encode("ascii", errors="ignore").decode("ascii")


def extract_pdf_text(pdf_path):
    """Return the text of all pages of the PDF at *pdf_path*, concatenated.

    Page texts are joined without a separator.
    """
    # Imported here so the helpers above can be used without pdfplumber.
    import pdfplumber

    # The context manager closes the underlying file once the pages are read.
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text; use "".
        return "".join(page.extract_text() or "" for page in pdf.pages)


def main():
    """Scrape every PDF in ``input_path`` and write the ASCII-only result."""
    combined = "".join(
        extract_pdf_text(pdf_path) for pdf_path in find_pdf_files(input_path)
    )
    output_file = input_path / OUTPUT_NAME
    output_file.write_text(strip_non_ascii(combined), encoding="ascii")


if __name__ == "__main__":
    main()