tag
+
+ fn = (
+ out_dir
+ + "/"
+ + data.get("name", "").replace(" ", "_")
+ + ".txt"
+ )
+ text_writer = TextWriter(text_save_fn=fn)
+
+ text = html2text.html2text(str(help_soup))
+
+ text_writer.write_txt("Title: " + title_url)
+ text_writer.write_txt("URL: " + url_link)
+ text_writer.write_txt(f"Language: {lang}")
+ text_writer.write_txt("-----")
+ text_writer.write_txt(content=text, newline=False)
+
+ # write _version.txt file
+ version_writer = KleverVersion()
+ version_filepath = f"{out_dir}/_version.txt"
+ text_writer = TextWriter(version_filepath)
+ text_writer.write_txt("Title: KleverBot version")
+ text_writer.write_txt("-----")
+ text_writer.write_txt(version_writer.get_en_version())
+ text_writer.write_txt(version_writer.get_jp_version(), newline=False)
+
+ # remove previously scraped files, except _appendix.txt
+
+ training_folder = "../../data/Klever/training_files/"
+ model_folder = "../../models/Klever"
+ # Iterate through the list of files and remove them
+ for dirpath, dirnames, filenames in os.walk(training_folder):
+ for filename in filenames:
+ # skip _appendix
+ if filename == "_appendix.txt":
+ continue
+
+ file_path = os.path.join(dirpath, filename)
+ if os.path.isfile(file_path):
+ # remove with git for commit
+ subprocess.run(
+ ["git", "rm", file_path],
+ check=False,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.STDOUT,
+ )
+
+ # copy version file to models folder
+ shutil.copy(version_filepath, model_folder)
+
+ # add scraped contents to folder for training
+ shutil.copytree(
+ common_config["klever_scraped_folder"],
+ training_folder,
+ dirs_exist_ok=True,
+ )
diff --git a/src/scraper/tt_scraper.py b/src/scraper/tt_scraper.py
new file mode 100644
index 0000000000000000000000000000000000000000..403f9bc609e0307d2d86517dd45b6b738a390f83
--- /dev/null
+++ b/src/scraper/tt_scraper.py
@@ -0,0 +1,1437 @@
+import ast
+import configparser
+import copy
+import json
+import logging
+import os
+import re
+import shutil
+import subprocess
+import sys
+from collections import deque
+from datetime import datetime
+from urllib.parse import urlparse
+
+import bs4
+import requests
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+HIRAGANA = re.compile("[\u3040-\u309F]")
+KATAKANA = re.compile("[\u30A0-\u30FF]")
+CJK = re.compile("[\u4300-\u9faf]")
+
+
+class DirectorySetup:
+ """Text writer for preparing folder for scraping"""
+
+ def __init__(self, common_cfg):
+ self.common_cfg = common_cfg
+ self.scraped_folder = common_cfg["scraped_folder"]
+ self.scraped_debug_folder = common_cfg["scraped_debug_folder"]
+ self.scraped_postprocessing_folder = common_cfg[
+ "scraped_postprocessing_folder"
+ ]
+
+ self.new_folders = {
+ "scraped_folder": self.scraped_folder,
+ }
+ if common_cfg.getboolean("postprocessing_scrape"):
+ self.new_folders[
+ "scraped_postprocessing_folder"
+ ] = self.scraped_postprocessing_folder
+ if common_cfg.getboolean("debug"):
+ self.new_folders[
+ "scraped_debug_folder"
+ ] = self.scraped_debug_folder
+
+ self.create_dir()
+
+ def create_dir(self):
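+ """Create one output folder per configured language, wiping existing folders when remake_scrape is set"""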
+ for key_name, folders in self.new_folders.copy().items():
+ for lang in ast.literal_eval(self.common_cfg["website_lang"]):
+ folder_name = f"{folders}/{lang}"
+
+ if self.common_cfg.getboolean("remake_scrape"):
+ if os.path.exists(folder_name):
+ shutil.rmtree(folder_name)
+
+ os.makedirs(folder_name)
+
+ self.new_folders[f"{key_name}_{lang}"] = folder_name
+
+ def get_scraped_folder(self, lang):
+ return self.new_folders[f"scraped_folder_{lang}"]
+
+ def get_scraped_debug_folder(self, lang):
+ return self.new_folders[f"scraped_debug_folder_{lang}"]
+
+ def get_scraped_postprocessing_folder_folder(self, lang):
+ return self.new_folders[f"scraped_postprocessing_folder_{lang}"]
+
+
+class TextWriter:
+ """Text writer for writing text to file"""
+
+ def __init__(self, text_save_fn="website_content.txt", urls=None):
+ self.save_fn = text_save_fn
+ self.urls = urls
+
+ def write_txt(self, content, newline=True):
+ """Write content to file in append mode"""
+ with open(self.save_fn, "a", encoding="utf-8") as outfile:
+ outfile.write(content)
+ if newline:
+ outfile.write("\n")
+
+
+class TTBotVersion:
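+ """Build version strings stamped with the current date, in English and Japanese"""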
+ def __init__(self):
+ self.date = datetime.now()
+ self.en_version = (
+ "- Training data includes information up until {en_date}"
+ )
+ self.jp_version = "- 訓練データには{jp_date}までの情報が含まれています"
+
+ def get_en_version(self):
+ return self.en_version.format(en_date=self.date.strftime("%b %d"))
+
+ def get_jp_version(self):
+ return self.jp_version.format(jp_date=self.date.strftime("%m月%d日"))
+
+
+class HtmlTextParser:
+ """Parser to extract from html"""
+
+ def __init__(
+ self,
+ urls,
+ is_remove_header=True,
+ is_remove_footer=True,
+ is_remove_comment=True,
+ is_remove_span=True,
+ text_wtr=None,
+ debug_text_wtr=None,
+ common_cfg=None,
+ ):
+ self.urls = urls
+ self.text_wtr = text_wtr
+ self.is_remove_header = is_remove_header
+ self.is_remove_footer = is_remove_footer
+ self.is_remove_comment = is_remove_comment
+ self.is_remove_span = is_remove_span
+ self.common_cfg = common_cfg
+ self.debug_text_wtr = debug_text_wtr
+ self.response = self.request_url(self.urls)
+ self.soup = self.parse_response_body()
+
+ def request_url(self, urls):
+ """Request to an url, remove comment within response"""
+ # Initiate a session and update the headers.
+ sess = requests.session()
+ headers = {
+ "accept-language": "ja-JP,ja;q=0.9,en;q=0.8,zh-TW;q=0.7",
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/76.0.3809.132 Safari/537.36",
+ }
+ sess.headers.update(headers)
+ res = sess.get(urls, timeout=10).content.decode("utf-8")
+ return self.remove_comment(res) if self.is_remove_comment else res
+
+ def parse_response_body(self):
+ """Parse HTML with beautiful soup"""
+ res = BeautifulSoup(self.response, "html.parser")
+ if self.is_remove_span:
+ res = self.remove_span(res)
+ if self.is_remove_header:
+ res = self.remove_head_or_footer(
+ res,
+ is_header=True,
+ is_footer=False,
+ )
+ if self.is_remove_footer:
+ res = self.remove_head_or_footer(
+ res,
+ is_header=False,
+ is_footer=True,
+ )
+ return res
+
+ def remove_head_or_footer(
+ self,
+ orig_str,
+ is_header=False,
+ is_footer=False,
+ ):
+ """Remove header or footer element depending on the flag"""
+ section = []
+ if is_header:
+ section.append("header")
+ elif is_footer:
+ section.append("footer")
+
+ for each in section:
+ sub_section = orig_str.find(each)
+ if sub_section:
+ sub_section.extract()
+ return orig_str
+
+ def remove_comment(self, orig_str):
+ """Remove comment within html using regrex"""
+ return re.sub("()", "", orig_str, flags=re.DOTALL)
+
+ def recursive_extract_text_html(self):
+ """Recursively loop through html tag to get the text"""
+ if not self.text_wtr:
+ logger.warning(
+ "Cannot extract text if there is no text writer, skip "
+ "extracting...",
+ )
+ else:
+ extract_html(
+ underlying_write=self.text_wtr,
+ underlying_debug_write=self.debug_text_wtr,
+ html=self.soup,
+ common_cfg=self.common_cfg,
+ soup_url=self.urls,
+ )
+
+ def remove_span(self, orig_str):
+ """Remove span, br and em tag"""
+ for span_tag in orig_str.findAll("span"):
+ span_tag.unwrap()
+ for br_tag in orig_str.findAll("br"):
+ br_tag.unwrap()
+ for em_tag in orig_str.findAll("em"):
+ em_tag.unwrap()
+ return orig_str
+
+
+class TTUrlScraper:
+ """Tokyo Techies Websites URL scraper"""
+
+ def __init__(
+ self,
+ root_uri="https://www.tokyotechies.com",
+ root_uri_jp="https://www.tokyotechies.com/ja",
+ bad_url=None,
+ common_cfg=None,
+ ):
+ self.root_uri = root_uri
+ self.root_uri_jp = root_uri_jp
+ self.url_set = {
+ self.clean_url(self.root_uri),
+ self.clean_url(self.root_uri_jp),
+ }
+ self.url_dict = {
+ "0": {
+ "url": self.clean_url(self.root_uri),
+ "url_summary": "Homepage",
+ },
+ "1": {
+ "url": self.clean_url(self.root_uri_jp),
+ "url_summary": "Homepage in Japanese",
+ },
+ }
+ self.tt_url_dict = self.url_dict.copy()
+ self.na_url_list = set()
+
+ self.bad_url = bad_url
+ self.common_cfg = common_cfg
+
+ def clean_url(self, url):
+ """Clean an url by removing its last character with "/" and "#" """
+ if self.verify_uri(url):
+ if "/" in url[-1] or "#" in url[-1]:
+ url = url[:-1]
+ if self.verify_uri(url):
+ return url
+ else:
+ return url
+ else:
+ logger.error("Clean %s failed", url)
+
+ def parse_uri(self):
+ """Parse links within URL using bread first search"""
+ queue = deque([self.clean_url(self.root_uri)])
+
+ while queue:
+ next_uri_list = []
+
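+ # drain the current BFS level; newly discovered Tokyo Techies links are queued for the next level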
+ for _ in range(len(queue)):
+ uri = queue.popleft()
+
+ logger.info("Parsing URL %s...", uri)
+ html_parser = HtmlTextParser(
+ urls=self.clean_url(uri),
+ is_remove_footer=False,
+ is_remove_header=False,
+ is_remove_comment=False,
+ is_remove_span=False,
+ )
+ next_uri_list = self.get_list_url_from_uri(html_parser, uri)
+ if next_uri_list:
+ queue.extend(next_uri_list)
+
+ # List of all available URL
+ # self.dump_uri(self.url_dict)
+ # List of all Tokyo Techies domain URL
+ # self.dump_uri(self.tt_url_dict, "tt_urls.json")
+ # List of all unknown domain URL
+ # self.dump_na_uri(self.na_url_list)
+
+ return self.tt_url_dict
+
+ def get_list_url_from_uri(self, html_parser, orig_uri):
+ """Parse all links from a tag of a soup html, check if the link already
+ visited if not visited add to the queue"""
+ try:
+ html_soup = html_parser.soup
+ queue_list = []
+
+ for line in html_soup.find_all("a"):
+ link = line.get("href")
+ if (
+ link
+ and link[0] != "#"
+ and "mailto:" not in link
+ and "tel:" not in link
+ ):
+ if "#" in link:
+ link = link[: link.index("#")]
+
+ if link[0] == "/":
+ # make it a full link in case it is a relative path
+ link = self.clean_url(self.root_uri) + link
+ elif link[0] == "?":
+ # in case it is a next-page query string
+ link = self.clean_url(orig_uri) + link
+
+ # clean link
+ clean_link = None
+ clean_link = self.clean_url(link)
+
+ if (
+ clean_link
+ and self.verify_uri(clean_link)
+ and clean_link not in self.url_set
+ ):
+ # split ? in case the next page and previous page
+ # are already in the set
+ if "?" in clean_link and clean_link.count("?") > 1:
+ clean_link = "?".join(clean_link.split("?")[:-1])
+ if clean_link in self.url_set:
+ continue
+ # for content that is above tbd character
+ link_text = extract_html(
+ line,
+ common_cfg=self.common_cfg,
+ ) # line.text
+ text_json = {
+ "url": clean_link,
+ "url_summary": link_text,
+ }
+
+ self.url_set.add(clean_link)
+ # only BFS through Tokyo Techies domain
+ if (
+ clean_link.startswith(
+ "https://www.tokyotechies.com",
+ )
+ and clean_link not in self.bad_url
+ ):
+ queue_list.append(clean_link)
+ self.tt_url_dict[
+ str(len(self.tt_url_dict))
+ ] = text_json
+
+ if (
+ link_text == ""
+ and clean_link not in self.na_url_list
+ ):
+ self.na_url_list.add(clean_link)
+ else:
+ self.url_dict[str(len(self.url_dict))] = text_json
+ return queue_list
+
+ except Exception:
+ logger.error("Parsing URI failed", exc_info=True)
+
+ def check_200(self, uri):
+ """Check if the link is accessiable 200 OK"""
+ headers = {
+ "accept-language": "ja-JP,ja;q=0.9,en;q=0.8,zh-TW;q=0.7",
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/76.0.3809.132 Safari/537.36",
+ }
+ res = requests.get(uri, headers=headers, timeout=10)
+ return res.status_code == 200
+
+ def verify_uri(self, uri):
+ """Verify url by checking whether uri is accessible"""
+ try:
+ result = urlparse(uri)
+ return all([result.scheme, result.netloc])
+ except Exception:
+ logger.error(f"Bad Url: {uri}", exc_info=True)
+ return False
+
+ def dump_uri(self, uri_dict, filename="urls.json"):
+ """Dump URIs dictionary into json file"""
+ with open(filename, "w", encoding="utf-8") as file_writer:
+ json.dump(
+ uri_dict,
+ file_writer,
+ ensure_ascii=False,
+ indent=4,
+ sort_keys=False,
+ )
+
+ def dump_na_uri(self, na_uri, filename="unk_urls.txt"):
+ """Dump unknown URIs dictionary into json file"""
+ with open(filename, "w", encoding="utf-8") as file_writer:
+ for uri in na_uri:
+ file_writer.write(f"{uri}\n")
+
+
+class SoupRecursiveParser:
+ """Recusive extracting text content from bs4"""
+
+ def __init__(
+ self,
+ soup,
+ underlying_write,
+ special_character="",
+ orig_soup=None,
+ check_logo=False,
+ check_footer=False,
+ check_cert=False,
+ # href link of a tag -> to be updated for special_word
+ tag_a_url=None,
+ trans=None,
+ postprocess_tag=False,
+ url=None,
+ non_trans_tag=[],
+ special_words=[],
+ ):
+ self.soup = soup
+ self.underlying_write = underlying_write
+ self.orig_soup = orig_soup
+ self.url = url
+ # recursive variables
+ self.special_character = special_character
+ self.check_logo = check_logo
+ self.check_footer = check_footer
+ self.check_cert = check_cert
+ self.tag_a_url = (
+ tag_a_url # href link of a tag -> to be updated for special_word
+ )
+ self.trans = trans
+ self.postprocess_tag = postprocess_tag
+
+ # constant variable
+ self.non_trans_tag = non_trans_tag
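+ # sentinel text wrapped around w-layout-grid children; preprocess_string converts it to a blank line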
+ self.layout_grid_alt = "grid_cls"
+ self.special_word = special_words
+ self.class_separation = {
+ "container": self.separate_container,
+ "content-area": self.separate_content_area_or_layout_grid,
+ "w-layout-grid": self.separate_content_area_or_layout_grid,
+ "hero-heading": self.merge_heading,
+ "customers-list": self.extract_logo,
+ "white-devider": self.add_white_divider,
+ "klever-banner": self.add_h2_klever_banner,
+ "footer-contact": self.extract_footer_contact,
+ "certification": self.add_certification,
+ }
+
+ def recursive_extract(
+ self,
+ c_soup,
+ c_special_character,
+ c_check_logo,
+ c_check_footer,
+ c_check_cert,
+ c_trans,
+ c_tag_a_url=None,
+ ):
+ """Recursively extract the html"""
+ # if soup is a string -> write txt
+ if self.is_bs4_string(c_soup):
+ c_tag_a_url = self.preprocess_string(
+ c_soup,
+ c_special_character,
+ c_trans,
+ c_tag_a_url,
+ )
+ # TODO: check whether we should return here every time we find a string
+ return
+
+ # process using class name
+ elif c_soup.get("class") is not None:
+ # add new line for between container class
+ if "container" in c_soup.get("class"):
+ self.process_class("container")
+ # add section for content area in about us
+ # elif "content-area" in c_soup.get("class"):
+ # self.process_class(
+ # "content-area",
+ # soup=c_soup,
+ # new_text="\nSub Section: ",
+ # )
+ # separately add the Klever banner info
+ elif "klever-banner" in c_soup.get("class"):
+ self.process_class(
+ "klever-banner",
+ soup=c_soup,
+ new_text="Klever",
+ )
+ # add white space for white-devider class:
+ elif "white-devider" in c_soup.get("class"):
+ self.process_class(
+ "white-devider",
+ soup=c_soup,
+ new_text="-----",
+ )
+ # for layout grid, add a newline between grid items
+ elif "w-layout-grid" in c_soup.get("class"):
+ # skip for layout grid special case
+ if "company-grid" in c_soup.get("class"):
+ pass
+ else:
+ self.process_class(
+ "w-layout-grid",
+ soup=c_soup,
+ new_text="grid_cls",
+ )
+ elif "post-list" in c_soup.get("class"):
+ self.process_class(
+ "w-layout-grid",
+ soup=c_soup,
+ new_text="grid_cls",
+ )
+ elif "blog-collection-grid" in c_soup.get("class"):
+ self.process_class(
+ "w-layout-grid",
+ soup=c_soup,
+ new_text="grid_cls",
+ )
+ elif "w-slide" in c_soup.get("class"):
+ self.process_class(
+ "w-layout-grid",
+ soup=c_soup,
+ new_text="grid_cls",
+ )
+ # parse for heading
+ elif "hero-heading" in c_soup.get("class") and c_soup.name == "h1":
+ self.process_class(
+ "hero-heading",
+ soup=c_soup,
+ special_character=c_special_character,
+ check_logo=c_check_logo,
+ check_footer=c_check_footer,
+ check_cert=c_check_cert,
+ tag_a_url=c_tag_a_url,
+ )
+ return
+ # parse contact us icon
+ elif "footer-contact" in c_soup.get("class"):
+ c_check_footer = True
+ # parse certification in image footer
+ elif "certification" in c_soup.get("class"):
+ c_check_cert = True
+
+ # below are smaller conventional checks
+ # this flag is used to search for customer logo images
+ if "customers-list" in c_soup.get("class"):
+ c_check_logo = True
+ # skip some bad text
+ if "text-block-51" in c_soup.get("class"):
+ return
+ # skip duplicate successful stories
+ if "home-case-study-section" in c_soup.get("class"):
+ return
+ # skip news section class which are displayed None
+ if "news-section" in c_soup.get("class"):
+ return
+ # skip duplicate footer for desktop because
+ # certification design is better
+ if "desktop-footer-menu" in c_soup.get("class"):
+ return
+ # skip duplicate what separates us from the rest
+ if "separates-img" in c_soup.get("class"):
+ return
+ if "hidden-tablet" in c_soup.get("class"):
+ return
+ # skip blog container
+ if "blogs-copy" in c_soup.get(
+ "class",
+ ) and "wf-section" in c_soup.get("class"):
+ return
+ # skip form submission
+ if "contact-form-text" in c_soup.get("class"):
+ return
+ # skip contact us text
+ if c_soup.get("id") and "contact-us" in c_soup.get("id"):
+ return
+ # skip member classes since their content is duplicated in the
+ # "see more" section of the About Us page
+ if "key-persions" in c_soup.get(
+ "class",
+ ) or "team-members" in c_soup.get("class"):
+ return
+ # skip the cta except for the about us page:
+ if "cta" in c_soup.get("class") and self.url not in CTA_PAGE:
+ return
+
+ # check img tag in "customers-list"
+ if c_check_logo and c_soup.name == "img":
+ self.process_class(
+ "customers-list",
+ soup=c_soup,
+ special_character=c_special_character,
+ )
+ return
+ # check a tag in "footer-contact"
+ elif c_check_footer and c_soup.name == "img":
+ self.process_class(
+ "footer-contact",
+ soup=c_soup,
+ special_character=c_special_character,
+ )
+ return
+ elif c_check_cert and c_soup.name == "img":
+ self.process_class(
+ "certification",
+ soup=c_soup,
+ new_text="Certification: ",
+ )
+ return
+ # main recursive to extract content
+ elif hasattr(c_soup, "contents"):
+ for content in c_soup.contents:
+ translateable = (
+ c_soup.name not in self.non_trans_tag
+ ) # and soup.get('translate') != "no"
+ if translateable:
+ # skip the text
+ if content == "html":
+ continue
+ # extract the unorder list text
+ if c_soup.name == "ul":
+ c_special_character = "- "
+ # add Tokyo Techies to title
+ if c_soup.name == "title" and isinstance(
+ content,
+ bs4.element.NavigableString,
+ ):
+ if not str(content).startswith("Tokyo Techies"):
+ content = self.convert_bs4_string(
+ "Tokyo Techies: " + str(content),
+ )
+ # extract link from <a> tag with text
+ if c_soup.name == "a" and isinstance(
+ content,
+ bs4.element.NavigableString,
+ ):
+ if c_soup.get("href", "").startswith("http"):
+ tmp = c_soup.get("href", "")
+ content = bs4.element.NavigableString(
+ str(content) + f": see more at {tmp}",
+ )
+ else:
+ tmp = self.underlying_write.urls + c_soup.get(
+ "href",
+ "",
+ )
+ content = bs4.element.NavigableString(
+ str(content) + f": see more at {tmp}",
+ )
+ # separated recursion to attach links to text
+ # (see more, read at, ...)
+ elif (
+ c_soup.name == "a"
+ and c_soup.get("href", None) is not None
+ ):
+ c_tag_a_url = c_soup.get("href")
+
+ self.recursive_extract(
+ c_soup=content,
+ c_trans=c_soup.name,
+ c_special_character=c_special_character,
+ c_check_logo=c_check_logo,
+ c_check_footer=c_check_footer,
+ c_check_cert=c_check_cert,
+ c_tag_a_url=c_tag_a_url,
+ )
+
+ def process_class(self, tag_cls_name, **kwargs):
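+ """Dispatch to the handler registered for the given class name in class_separation"""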
+ return self.class_separation[tag_cls_name](**kwargs)
+
+ def separate_container(self, **kwargs):
+ self.underlying_write.write_txt("")
+ # Remove Section Text
+ # self.underlying_write.write_txt("Section: ", False)
+
+ def separate_content_area_or_layout_grid(self, **kwargs):
+ soup = kwargs["soup"]
+ new_text = kwargs["new_text"]
+
+ if hasattr(soup, "contents"):
+ for content in soup.contents:
+ if soup.name not in self.non_trans_tag:
+ new_tag = self.orig_soup.new_tag("div")
+ new_tag.append(new_text)
+ content.wrap(new_tag)
+
+ def add_h2_klever_banner(self, **kwargs):
+ soup = kwargs["soup"]
+ new_text = kwargs["new_text"]
+
+ if hasattr(soup, "contents"):
+ for content in soup.contents:
+ if soup.name not in self.non_trans_tag:
+ new_tag = self.orig_soup.new_tag("h2")
+ new_tag.append(new_text)
+ content.wrap(new_tag)
+
+ def add_white_divider(self, **kwargs):
+ soup = kwargs["soup"]
+ new_text = kwargs["new_text"]
+
+ soup.append(BeautifulSoup(f"{new_text}", "html.parser"))
+
+ def merge_heading(self, **kwargs):
+ soup = kwargs["soup"]
+ special_character = kwargs["special_character"]
+ check_logo = kwargs["check_logo"]
+ check_footer = kwargs["check_footer"]
+ check_cert = kwargs["check_cert"]
+ tag_a_url = kwargs["tag_a_url"]
+
+ self.recursive_extract(
+ c_soup=bs4.element.NavigableString(soup.text),
+ c_trans=soup.name,
+ c_special_character=special_character,
+ c_check_logo=check_logo,
+ c_check_footer=check_footer,
+ c_check_cert=check_cert,
+ c_tag_a_url=tag_a_url,
+ )
+
+ def extract_logo(self, **kwargs):
+ soup = kwargs["soup"]
+ special_character = kwargs["special_character"]
+
+ if hasattr(soup, "alt"):
+ alt_txt = soup.get("alt")
+ if "logo" in alt_txt:
+ alt_txt = HIRAGANA.sub("", alt_txt)
+ alt_txt = KATAKANA.sub("", alt_txt)
+ alt_txt = CJK.sub("", alt_txt)
+ alt_txt = (
+ alt_txt.replace("logo", "").rstrip().lstrip()
+ ) # .replace("ロゴ", "")
+ if alt_txt != "":
+ self.recursive_extract(
+ c_soup=bs4.element.NavigableString(
+ "- " + alt_txt.capitalize(),
+ ),
+ c_trans=soup.name,
+ c_special_character=special_character,
+ c_check_logo=False,
+ c_check_footer=False,
+ c_check_cert=False,
+ )
+
+ def add_certification(self, **kwargs):
+ soup = kwargs["soup"]
+ new_text = kwargs["new_text"]
+
+ if hasattr(soup, "alt"):
+ alt_txt = soup.get("alt")
+ self.underlying_write.write_txt(new_text, False)
+ self.underlying_write.write_txt(alt_txt)
+
+ def extract_footer_contact(self, **kwargs):
+ soup = kwargs["soup"]
+ special_character = kwargs["special_character"]
+
+ if hasattr(soup, "alt"):
+ alt_txt = soup.get("alt")
+ if alt_txt:
+ alt_txt = HIRAGANA.sub("", alt_txt)
+ alt_txt = KATAKANA.sub("", alt_txt)
+ alt_txt = CJK.sub("", alt_txt)
+ alt_txt = alt_txt.replace("icon", "").rstrip().lstrip()
+ if alt_txt != "":
+ self.recursive_extract(
+ c_soup=bs4.element.NavigableString(
+ alt_txt.capitalize() + ": ",
+ ),
+ c_trans=soup.name,
+ c_special_character=special_character,
+ c_check_logo=False,
+ c_check_footer=False,
+ c_check_cert=False,
+ )
+
+ def preprocess_string(
+ self,
+ soup_string: bs4.element.NavigableString,
+ c_special_character: str,
+ c_trans: str,
+ c_tag_a_url: str,
+ ):
+ text = str(soup_string)
+ # skip text that is empty, a newline, or a zero-width joiner (\u200d)
+ if text != "" and text != "\n" and text != "\u200d":
+ # Special case:
+ # add a new line for next children inside w-layout-grid
+ if text == self.layout_grid_alt:
+ self.underlying_write.write_txt("")
+ # TODO: test whether we should still keep the tag_a_url here
+ # or return None by default
+ return c_tag_a_url
+ else:
+ if c_tag_a_url is not None:
+ for special_w in self.special_word:
+ if text.startswith(special_w):
+ c_tag_a_url = (
+ c_tag_a_url
+ if c_tag_a_url.startswith(
+ "http",
+ )
+ else self.underlying_write.urls + c_tag_a_url
+ )
+ text = text + ": " + c_tag_a_url
+ if not self.postprocess_tag:
+ self.underlying_write.write_txt(
+ c_trans + ": " + c_special_character + text,
+ )
+ else:
+ self.postprocessing_on_tag(
+ c_special_character,
+ text,
+ c_trans,
+ )
+ return None
+
+ def is_bs4_string(self, element):
+ """Check if element is NavigableString"""
+ return isinstance(element, bs4.element.NavigableString)
+
+ def convert_bs4_string(self, text):
+ return bs4.element.NavigableString(text)
+
+ def postprocessing_on_tag(self, c_special_character, c_text, c_trans):
+ insert_text = c_special_character + c_text
+ if insert_text != "":
+ # logic to update tag here
+ c_tag = str(c_trans)
+ if c_tag == "title":
+ c_tag = "Title: "
+ elif c_tag == "h1":
+ # Uncomment old logic to get the format bullet again
+ # c_tag = "Header 1: "
+ c_tag = "# "
+ elif c_tag == "h2":
+ # c_tag = "Header 2: "
+ c_tag = "## "
+ elif c_tag == "h3":
+ # c_tag = "Header 3: "
+ c_tag = "### "
+ elif c_tag == "h4":
+ # c_tag = "Header 4: "
+ c_tag = "#### "
+ elif c_tag == "h5":
+ # c_tag = "Header 5: "
+ c_tag = "##### "
+ elif c_tag == "h6":
+ # c_tag = "Header 6: "
+ c_tag = "###### "
+ elif c_tag == "figcaption":
+ c_tag = "Image: "
+ elif c_tag in [
+ "div",
+ "a",
+ "p",
+ "ul",
+ "li",
+ "strong",
+ "article",
+ "em",
+ "img",
+ ]:
+ c_tag = ""
+ else:
+ c_tag = c_tag + ": "
+ # end logic to update tag
+ insert_text = c_tag + insert_text
+ if (
+ len(insert_text) == 5 and insert_text == "ul: -"
+ ): # should check if we can remove this
+ pass
+ else:
+ self.underlying_write.write_txt(insert_text)
+
+
+class BulletWriter:
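+ """Track heading counters (h1-h6) and format them into numbered bullets such as "1.2.1." """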
+ def __init__(
+ self,
+ h1_c=None,
+ h2_c=None,
+ h3_c=None,
+ h4_c=None,
+ h5_c=None,
+ h6_c=None,
+ ):
+ self.h1_c = h1_c
+ self.h2_c = h2_c
+ self.h3_c = h3_c
+ self.h4_c = h4_c
+ self.h5_c = h5_c
+ self.h6_c = h6_c
+
+ def increment_h1(self):
+ self.h1_c = 1 if self.h1_c is None else self.h1_c + 1
+
+ def increment_h2(self):
+ self.h2_c = 1 if self.h2_c is None else self.h2_c + 1
+
+ def increment_h3(self):
+ self.h3_c = 1 if self.h3_c is None else self.h3_c + 1
+
+ def increment_h4(self):
+ self.h4_c = 1 if self.h4_c is None else self.h4_c + 1
+
+ def increment_h5(self):
+ self.h5_c = 1 if self.h5_c is None else self.h5_c + 1
+
+ def increment_h6(self):
+ self.h6_c = 1 if self.h6_c is None else self.h6_c + 1
+
+ def delete_child(self, child_list):
+ for child in child_list:
+ if child == "h1":
+ self.h1_c = None
+ elif child == "h2":
+ self.h2_c = None
+ elif child == "h3":
+ self.h3_c = None
+ elif child == "h4":
+ self.h4_c = None
+ elif child == "h5":
+ self.h5_c = None
+ elif child == "h6":
+ self.h6_c = None
+
+ def remove_child_h1(self):
+ self.delete_child(["h2", "h3", "h4", "h5", "h6"])
+
+ def remove_child_h2(self):
+ self.delete_child(["h3", "h4", "h5", "h6"])
+
+ def remove_child_h3(self):
+ self.delete_child(["h4", "h5", "h6"])
+
+ def remove_child_h4(self):
+ self.delete_child(["h5", "h6"])
+
+ def remove_child_h5(self):
+ self.delete_child(["h6"])
+
+ def format_bullet(self):
+ ret = ""
+ if self.h1_c:
+ ret += str(self.h1_c) + "."
+ if self.h2_c:
+ ret += str(self.h2_c) + "."
+ if self.h3_c:
+ ret += str(self.h3_c) + "."
+ if self.h4_c:
+ ret += str(self.h4_c) + "."
+ if self.h5_c:
+ ret += str(self.h5_c) + "."
+ if self.h6_c:
+ ret += str(self.h6_c) + "."
+ return ret
+
+
+def info(*argv):
+ """Info level log"""
+ logger.debug(str(argv))
+
+
+def error(*argv):
+ """Error level log"""
+ logger.error(str(argv))
+
+
+def extract_html(
+ html,
+ underlying_write=None,
+ common_cfg=None,
+ underlying_debug_write=None,
+ soup_url=None,
+):
+ """Recusively extract html file"""
+ # in case underlying_write is not None, it means writing to file
+ if underlying_write is not None:
+ html_debug = copy.copy(html)
+ recursive_parser = SoupRecursiveParser(
+ soup=html,
+ underlying_write=underlying_write,
+ special_character="",
+ orig_soup=html,
+ postprocess_tag=True,
+ url=soup_url,
+ non_trans_tag=ast.literal_eval(
+ common_cfg["non_translateable_tags"],
+ ),
+ special_words=ast.literal_eval(common_cfg["special_words"]),
+ )
+
+ recursive_parser.recursive_extract(
+ c_soup=recursive_parser.soup,
+ c_special_character=recursive_parser.special_character,
+ c_check_logo=recursive_parser.check_logo,
+ c_check_footer=recursive_parser.check_footer,
+ c_check_cert=recursive_parser.check_cert,
+ c_trans=recursive_parser.trans,
+ c_tag_a_url=recursive_parser.tag_a_url,
+ )
+
+ # debug settings
+ if (
+ common_cfg
+ and common_cfg.getboolean("debug")
+ and underlying_debug_write
+ ):
+ recursive_parser_debug = SoupRecursiveParser(
+ soup=html_debug,
+ underlying_write=underlying_debug_write,
+ special_character="",
+ orig_soup=html_debug,
+ postprocess_tag=False,
+ url=soup_url,
+ non_trans_tag=ast.literal_eval(
+ common_cfg["non_translateable_tags"],
+ ),
+ special_words=ast.literal_eval(common_cfg["special_words"]),
+ )
+
+ recursive_parser_debug.recursive_extract(
+ c_soup=recursive_parser_debug.soup,
+ c_special_character=recursive_parser_debug.special_character,
+ c_check_logo=recursive_parser_debug.check_logo,
+ c_check_footer=recursive_parser_debug.check_footer,
+ c_check_cert=recursive_parser_debug.check_cert,
+ c_trans=recursive_parser_debug.trans,
+ c_tag_a_url=recursive_parser_debug.tag_a_url,
+ )
+
+ # otherwise, return the extracted text as a string
+ else:
+ # recursive write
+ write_file = []
+ write_file = recursive_write(
+ soup=html,
+ dump_list=write_file,
+ non_trans_tag=ast.literal_eval(
+ common_cfg["non_translateable_tags"],
+ ),
+ )
+ return "\n".join(write_file)
+
+
+def wrap(to_wrap, wrap_in):
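+ """Replace to_wrap with wrap_in, then re-append the original element inside wrap_in"""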
+ contents = to_wrap.replace_with(wrap_in)
+ wrap_in.append(contents)
+
+
+def recursive_write(soup, dump_list, non_trans_tag):
+ """Recusrively extract html text without additional modification"""
+ if isinstance(soup, bs4.element.NavigableString):
+ text = str(soup)
+ if text != "" and text != "\n":
+ dump_list.append(text)
+ return dump_list
+ elif soup.get("class") is not None and soup.get("class") == ["container"]:
+ # should not add a section break for href elements
+ dump_list.append("")
+
+ if hasattr(soup, "contents"):
+ for content in soup.contents:
+ translateable = (
+ soup.name not in non_trans_tag
+ ) # and soup.get('translate') != "no"
+ if translateable:
+ if content == "html":
+ continue
+ dump_list = recursive_write(content, dump_list, non_trans_tag)
+ return dump_list
+
+
+def scrape_tt(save_filename, is_jp, original_url, com_cfg, dir_setup):
+ """Scrape Tokyo Techies URI
+ Args:
+ save_filename: name of the content file to be saved
+ is_jp: True for JA page, False for EN page
+ original_url: URI link
+ com_cfg: common config within config.cfg
+ dir_setup: DirectorySetup instance used to resolve the folder for saving content
+ """
+
+ save_fn_posfix = f"/{save_filename}.txt"
+
+ if is_jp:
+ jp_save_path = dir_setup.get_scraped_folder("ja") + save_fn_posfix
+ text_writer = TextWriter(jp_save_path, urls=original_url)
+ else:
+ en_save_path = dir_setup.get_scraped_folder("en") + save_fn_posfix
+ text_writer = TextWriter(en_save_path, urls=original_url)
+
+ lang = "Japanese" if is_jp else "English"
+
+ # logic to move /blog and /jobs to their parent pages
+ text_url = original_url
+ for parent_url in [
+ "https://www.tokyotechies.com/blog-ja", # ordering ja is important
+ "https://www.tokyotechies.com/jobs-ja",
+ "https://www.tokyotechies.com/blog",
+ "https://www.tokyotechies.com/jobs",
+ ]:
+ if original_url.startswith(parent_url):
+ text_url = parent_url
+ break
+
+ _orig_soup_title = BeautifulSoup(
+ requests.get(
+ original_url,
+ timeout=10,
+ ).content.decode("utf-8"),
+ "html.parser",
+ ).title.string
+ text_writer.write_txt("Title: " + _orig_soup_title)
+ text_writer.write_txt("URL: " + text_url)
+ text_writer.write_txt(f"Language: {lang}")
+ text_writer.write_txt("-----")
+
+ if com_cfg.getboolean("debug"):
+ if is_jp:
+ debug_jp_save_path = (
+ dir_setup.get_scraped_debug_folder("ja") + save_fn_posfix
+ )
+ debug_text_writer = TextWriter(
+ debug_jp_save_path,
+ urls=original_url,
+ )
+ else:
+ debug_en_save_path = (
+ dir_setup.get_scraped_debug_folder("en") + save_fn_posfix
+ )
+ debug_text_writer = TextWriter(
+ text_save_fn=debug_en_save_path,
+ urls=original_url,
+ )
+
+ debug_text_writer.write_txt("Title: " + _orig_soup_title)
+ debug_text_writer.write_txt("URL: " + text_url)
+ debug_text_writer.write_txt(f"Language: {lang}")
+ debug_text_writer.write_txt("-----")
+ else:
+ debug_text_writer = None
+
+ html_text_parser = HtmlTextParser(
+ urls=original_url,
+ text_wtr=text_writer,
+ is_remove_footer=True
+ if original_url
+ not in [
+ "https://www.tokyotechies.com",
+ "https://www.tokyotechies.com/ja",
+ ]
+ else False,
+ is_remove_header=True,
+ is_remove_comment=True,
+ is_remove_span=True,
+ debug_text_wtr=debug_text_writer,
+ common_cfg=com_cfg,
+ )
+ html_text_parser.recursive_extract_text_html()
+
+ # Start to post-processing on the txt file
+
+ if com_cfg.getboolean("debug") and com_cfg.getboolean(
+ "postprocessing_scrape",
+ ):
+ if is_jp:
+ postprocessing_save_path = (
+ dir_setup.get_scraped_postprocessing_folder_folder("ja")
+ + save_fn_posfix
+ )
+ else:
+ postprocessing_save_path = (
+ dir_setup.get_scraped_postprocessing_folder_folder("en")
+ + save_fn_posfix
+ )
+
+ # start process to merge header to the same line
+ merge_line_list = []
+ merge_line = []
+ p_head = None
+ header_set = {f"Header {i}" for i in range(1, 7)}
+
+ if is_jp:
+ with open(jp_save_path, encoding="utf-8") as f:
+ text_list = f.readlines()
+ else:
+ with open(en_save_path, encoding="utf-8") as f:
+ text_list = f.readlines()
+
+ for idx, texts in enumerate(text_list):
+ matched = False
+ for header_t in header_set:
+ if header_t in texts:
+ matched = True
+ if p_head is not None and p_head == header_t:
+ merge_line.append(p_head)
+ merge_line.append(idx)
+ break
+ else:
+ p_head = header_t
+ if merge_line:
+ merge_line_list.append(merge_line)
+ merge_line = []
+ if not matched:
+ p_head = None
+ if merge_line:
+ merge_line_list.append(merge_line)
+ merge_line = []
+
+ if merge_line:
+ merge_line_list.append(merge_line)
+
+ # process the merges in reverse so that earlier indices remain
+ # valid while lines are being merged
+ merge_line_list = merge_line_list[::-1]
+ for merge_line in merge_line_list:
+ # check whether more than two lines must be merged
+ if len(merge_line) > 3:
+ header_t, _s, _e = (
+ merge_line[0],
+ merge_line[1] - 1,
+ merge_line[-1],
+ )
+ # merge only two lines
+ else:
+ header_t, _s, _e = (
+ merge_line[0],
+ merge_line[1] - 1,
+ merge_line[1],
+ )
+
+ merged_text = (
+ text_list[_s].replace(header_t + ": ", "").replace("\n", "")
+ )
+ for j in range(_s + 1, _e + 1):
+ merged_text = (
+ merged_text
+ + " "
+ + text_list[j]
+ .replace(header_t + ": ", "")
+ .replace("\n", "")
+ )
+
+ text_list = (
+ text_list[:_s]
+ + [header_t + ": " + merged_text + "\n"]
+ + text_list[_e + 1 :]
+ )
+
+ # end process to merge header to the same line
+
+ # start process to make format bulleting
+ buller_writer = BulletWriter()
+ header_list = sorted(list(header_set))
+
+ for idx, texts in enumerate(text_list):
+ for header_t in header_list:
+ if header_t in texts:
+ htype = header_list.index(header_t) + 1
+
+ # if htype == 1:
+ # buller_writer.increment_h1()
+ # buller_writer.remove_child_h1()
+ if htype == 2:
+ buller_writer.increment_h2()
+ buller_writer.remove_child_h2()
+ elif htype == 3:
+ buller_writer.increment_h3()
+ buller_writer.remove_child_h3()
+ elif htype == 4:
+ buller_writer.increment_h4()
+ buller_writer.remove_child_h4()
+ elif htype == 5:
+ buller_writer.increment_h5()
+ buller_writer.remove_child_h5()
+ elif htype == 6:
+ buller_writer.increment_h6()
+
+ text_list[idx] = text_list[idx].replace(
+ header_t + ": ",
+ buller_writer.format_bullet() + " ",
+ )
+ break
+
+ with open(postprocessing_save_path, "w", encoding="utf-8") as outfile:
+ outfile.write("".join(text_list))
+
+ # end process to make format bulleting
+
+
+def setup_logger(debug_option=False):
+ """Setup logger modules"""
+ _logger = logging.getLogger(__name__)
+ _logger.setLevel(logging.DEBUG if debug_option else logging.INFO)
+
+ # Print to standard output if in debug mode
+ if debug_option:
+ # Stream handler
+ std_out_stream_handler = logging.StreamHandler(sys.stdout)
+ formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
+ std_out_stream_handler.setFormatter(formatter)
+ _logger.addHandler(std_out_stream_handler)
+
+ # File handler
+ std_out_file_handler = logging.FileHandler(filename="debug.log")
+ std_out_file_handler.setLevel(logging.WARN)
+ std_out_file_handler.setFormatter(formatter)
+ _logger.addHandler(std_out_file_handler)
+
+ return _logger
+
+
+if __name__ == "__main__":
+ config_parser = configparser.ConfigParser()
+ config_parser.read("config.cfg")
+ common_config = config_parser["common"]
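+ # the [common] section is expected to define: scraped_folder, scraped_debug_folder,
+ # scraped_postprocessing_folder, website_lang, debug, postprocessing_scrape,
+ # remake_scrape, non_translateable_tags, special_words, and bad_url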
+
+ logger = setup_logger(common_config.getboolean("debug"))
+ directory_setup = DirectorySetup(common_config)
+
+ tt_url_scraper = TTUrlScraper(
+ bad_url=ast.literal_eval(common_config["bad_url"]),
+ common_cfg=common_config,
+ )
+ tt_url_json = tt_url_scraper.parse_uri()
+ # tt_url_json = {
+ # "8": {
+ # "url": "https://www.tokyotechies.com",
+ # "url_summary": "Homepage"
+ # },
+ # }
+
+ JP_TAG = [
+ "https://www.tokyotechies.com/ja",
+ "https://www.tokyotechies.com/jobs-ja",
+ "https://www.tokyotechies.com/blog-ja",
+ ]
+ CTA_PAGE = [
+ "https://www.tokyotechies.com/ja/about-us",
+ "https://www.tokyotechies.com/about-us",
+ ]
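+ # c counts how many homepage-style filenames have been generated (EN and JA); at most two are expected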
+ c = 0
+
+ for key, url_value in tqdm(tt_url_json.items()):
+ path_url = url_value.get("url")
+
+ # Skip all blog pages
+ if path_url.startswith("https://www.tokyotechies.com/blog"):
+ continue
+
+ is_jp_page = any([tag in path_url for tag in JP_TAG])
+
+ save_fn = (
+ path_url.replace("https://www.tokyotechies.com", "")
+ .replace("/", "_")
+ .replace("#", "_")
+ .replace("?", "_")
+ .replace(":", "_")
+ .replace("=", "_")
+ .replace(
+ "-",
+ "_",
+ )
+ )
+
+ # specific check for homepage
+ if not save_fn:
+ save_fn = "homepage"
+ c += 1
+ elif save_fn == "_ja":
+ save_fn = "ja_homepage"
+ c += 1
+ assert c < 3, "Expected at most one homepage per language"
+
+ scrape_tt(
+ save_fn.lstrip("_"),
+ is_jp_page,
+ path_url,
+ common_config,
+ directory_setup,
+ )
+
+ # write _version.txt file
+ version_writer = TTBotVersion()
+ version_filepath = (
+ directory_setup.get_scraped_postprocessing_folder_folder("en")
+ + "/_version.txt"
+ )
+ text_writer = TextWriter(version_filepath)
+ text_writer.write_txt("Title: Tokyo Techies: Tokyo Techies Bot version")
+ text_writer.write_txt("URL: https://www.tokyotechies.com")
+ text_writer.write_txt("Language: English")
+ text_writer.write_txt("-----")
+ text_writer.write_txt("Sub Section:")
+ text_writer.write_txt("Tokyo Techies Bot version")
+ text_writer.write_txt(version_writer.get_en_version())
+ text_writer.write_txt("")
+ text_writer.write_txt("Tokyo Techies ボットバージョン")
+ text_writer.write_txt(version_writer.get_jp_version(), newline=False)
+
+ # remove previously scraped files, except _appendix.txt
+
+ training_folder = "../../data/TokyoTechies/training_files/"
+ model_folder = "../../models/TokyoTechies"
+ # Iterate through the list of files and remove them
+ for dirpath, dirnames, filenames in os.walk(training_folder):
+ for filename in filenames:
+ # skip _appendix
+ if filename == "_appendix.txt":
+ continue
+
+ file_path = os.path.join(dirpath, filename)
+ if os.path.isfile(file_path):
+ # remove with git for commit
+ subprocess.run(
+ ["git", "rm", file_path],
+ check=False,
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.STDOUT,
+ )
+
+ # copy version file to models folder
+ shutil.copy(version_filepath, model_folder)
+
+ # TODO: scrape for klever suite pages as well
+ # add scraped contents to folder for training
+ for lang in ast.literal_eval(common_config["website_lang"]):
+ shutil.copytree(
+ common_config["scraped_postprocessing_folder"] + f"/{lang}",
+ training_folder,
+ dirs_exist_ok=True,
+ )