Spaces:
Runtime error
Runtime error
import os | |
# Optional extra metadata (for instance a source URL) to attach to index
# entries. Keys are file paths written with forward slashes; each value is a
# dict whose items are merged into the matching source record.
metadata_by_file_path = {
    "data/Daoism/Tao_Te_Ching.pdf": {
        "url": "https://www.with.org/tao_te_ching_en.pdf",
    },
    "data/Confucianism/Analects of Confucius.pdf": {
        "url": "https://chinatxt.sitehost.iu.edu/Analects_of_Confucius_(Eno-2015).pdf",
    },
}
def get_domains() -> list:
    """Return the names of the directories located directly under ``data``.

    Only first-level directories count as domains, which matches how
    ``parse_domain`` reads the path component right after ``data``. Names are
    returned exactly as they appear on disk (underscores are not replaced).

    Returns:
        A list of directory names; empty when ``data`` does not exist or has
        no sub-directories.
    """
    # The first triple yielded by os.walk describes "data" itself; deeper
    # triples are nested folders, not domains. os.walk yields nothing for a
    # missing directory, hence the empty fallback.
    _, top_level_dirs, _ = next(os.walk("data"), ("data", [], []))
    return list(top_level_dirs)
def get_sources() -> list:
    """Collect one record per ``.pdf`` file found anywhere under ``data``.

    Returns:
        A list of dicts. Each dict holds ``domain`` (from ``parse_domain``),
        ``name`` (from ``parse_name``) and ``file_path``, plus any extra keys
        registered for that file in ``metadata_by_file_path``.
    """
    res = []
    for root, _dirs, files in os.walk("data"):
        for file in files:
            if not file.endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            print("file_path", file_path)
            # The metadata table is keyed with forward slashes, while
            # os.path.join uses the platform separator; normalise the key so
            # the lookup also matches where os.sep is not "/".
            metadata_key = file_path.replace(os.sep, "/")
            res.append(
                {
                    "domain": parse_domain(file_path),
                    "name": parse_name(file_path),
                    "file_path": file_path,
                    # Spread last: metadata may add keys or override the above.
                    **metadata_by_file_path.get(metadata_key, {}),
                }
            )
    return res
def parse_name(source: str) -> str:
    """Derive a display name from a file path.

    Takes the final path component, drops its last extension and shows
    underscores as spaces.
    """
    stem = os.path.splitext(os.path.basename(source))[0]
    return " ".join(stem.split("_"))
def parse_domain(source: str) -> str:
    """Return the second ``os.sep``-separated component of *source*.

    Underscores in that component are shown as spaces. Raises ``IndexError``
    when *source* contains no ``os.sep``.
    """
    segments = source.split(os.sep)
    return " ".join(segments[1].split("_"))