File size: 861 Bytes
9577e45
 
 
 
 
 
 
 
770d2f5
9577e45
 
770d2f5
9577e45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""This module contains utility functions for the project"""

import mmh3
from haystack import Document


def get_unique_docs(dataset, unique_docs: set):
    """Build haystack Documents for contexts that have not been seen yet.

    Args:
        dataset: list of dictionaries; each record is read through the keys
            "context", "context_id", "context_title" and "url".
        unique_docs: set of context ids that were already processed. It is
            updated in place with every id this call turns into a Document,
            so the same set can be reused across calls.

    Returns:
        docs: list of haystack.Document, one for each record whose "context"
        is not None and whose "context_id" was not already in unique_docs.
    """
    collected = []
    for record in dataset:
        text = record["context"]
        # Records without a context are skipped before their id is looked at.
        if text is None:
            continue

        context_id = record["context_id"]
        if context_id in unique_docs:
            continue

        # Mark the id as seen first so later duplicates are filtered out.
        unique_docs.add(context_id)
        metadata = {
            "title": record["context_title"],
            "context_id": context_id,
            "url": record["url"],
            "source": "QASports",
        }
        collected.append(Document(content=text, meta=metadata))
    return collected