# Human-readable descriptions of the metadata keys of each dataset.
# There is one string per dataset; the trailing comment on each entry names
# the dataset it describes, in the same order as the comments in
# ``bucket_info``.
# Each string is a "- key: meaning" bullet list. The leading newline and the
# four-space indentation are part of the runtime value, so keep them intact
# when editing an entry.
# NOTE(review): presumably looked up by position by the consumer of this
# list - confirm before reordering or inserting entries.
meta_keys = [
    """
    - dup_signals: relevant information related to dedup metrics
    - source: url to download
    - file: id to identify PMC files
    - Corpus id: id to identify corpus
    - Openaccessinfo: License/doi information
    - Pmid: pubmed id
    - Title: title of the paper, mainly for philpapers dataset
    - Type: type of data, mainly for philpapers dataset
    - Creator: creator of the data, mainly for philpapers dataset
    - Subject: subject of the data, mainly for philpapers dataset
    - Date: date of the data, mainly for philpapers dataset
    - Identifier: identifier of the data, mainly for philpapers dataset
    - Description: description of the data, mainly for philpapers dataset
    - Datestamp: datestamp of the data, mainly for philpapers dataset
    """,  # papers
    """
    - dup_signals: relevant information related to dedup metrics
    - url: wikipedia link
    - title: title of the article
    - language: language of the article
    """,  # wikipedia
    """
    - dup_signals: relevant information related to dedup metrics
    - source: which stackexchange it is coming from, example: pt.stackoverflow
    - answer_score: list of scores of answer in same order as answer in text
    - comment_score: list of scores of comment in same order as comment in text
    """,  # stackexchange
    """
    - dup_signals: relevant information related to dedup metrics
    - language: language of the text
    """,  # europarl
    """
    - dup_signals: relevant information related to dedup metrics
    - Channel: channel of the irc
    - month: month of the post
    """,  # ubuntu irc
    """
    - dup_signals: relevant information related to dedup metrics
    - Id: Unique identifier of the post
    """,  # hackernews
    """
    - dup_signals: relevant information related to dedup metrics
    - short_book_title: short title of the book
    - publication_date: publication date of the book
    - url: url of the book
    """,  # pg19
    """
    - dup_signals: relevant information related to dedup metrics
    - bibliographic_information: bibliographic information, may contain title
    - source_file: link of the source file
    - Abstract: abstract of the document
    - Citations: list of citations
    - Assignees: details about assignees
    - Classification: classification of the document
    - Inventors: details about inventors
    """,  # uspto
    """
    - dup_signals: relevant information related to dedup metrics
    """,  # freelaw
    """
    - dup_signals: relevant information related to dedup metrics
    - source: field of maths
    """,  # dmmaths
]

# One text entry per dataset, listed in the same dataset order as the
# trailing comments on ``meta_keys``.
# Every entry is currently a whitespace-only placeholder: a newline followed
# by four spaces, written here as an escaped single-line literal.
# NOTE(review): what "bucket" refers to is not visible in this file - confirm
# with the consumer of this list before filling these in.
bucket_info = [
    "\n    ",  # papers
    "\n    ",  # wikipedia
    "\n    ",  # stackexchange
    "\n    ",  # europarl
    "\n    ",  # ubuntu irc
    "\n    ",  # hackernews
    "\n    ",  # pg19
    "\n    ",  # uspto
    "\n    ",  # freelaw
    "\n    ",  # dmmaths
]