Source code for wikidatasets.processFunctions

import bz2
import pickle
import os
import pandas as pd

from tqdm import tqdm
from wikidatasets.utils import get_results, clean
from wikidatasets.utils import get_pickle_path, write_to_pickle
from wikidatasets.utils import get_id, get_label, to_triplets, intersect, to_json
from wikidatasets.utils import concatpkls, write_csv, write_ent_dict, write_rel_dict, write_readme, relabel


def get_subclasses(subject):
    """Get a list of WikiData IDs of entities which are subclasses of the subject.

    Parameters
    ----------
    subject: str
        String describing the subject (e.g. 'Q5' for human).

    Returns
    -------
    result: list
        List of WikiData IDs of entities which are subclasses of the subject.
    """
    endpoint_url = "https://query.wikidata.org/sparql"
    query = """SELECT ?item
    WHERE {?item wdt:P279* wd:""" + subject + """ .}"""
    results = get_results(endpoint_url, query)
    return [clean(result['item']['value']) for result in results['results']['bindings']]
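
A minimal usage sketch (not part of the module source), assuming network access to the public Wikidata SPARQL endpoint; the variable name is illustrative:

    from wikidatasets.processFunctions import get_subclasses

    # WikiData IDs of everything that is (transitively) a subclass of Q5 (human),
    # collected through the wdt:P279* property path; includes 'Q5' itself.
    human_classes = get_subclasses('Q5')
    print(len(human_classes), human_classes[:5])
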
def query_wikidata_dump(dump_path, path, n_lines, test_entities=None, collect_labels=False):
    """This function goes through a Wikidata dump. It can either collect entities that are instances of \
    `test_entities` or collect the dictionary of labels. It can also do both.

    Parameters
    ----------
    dump_path: str
        Path to the latest-all.json.bz2 file downloaded from https://dumps.wikimedia.org/wikidatawiki/entities/.
    path: str
        Path to where pickle files will be written.
    n_lines: int
        Number of lines of the dump. Fastest way I found was `$ bzgrep -c ".*" latest-all.json.bz2`. This can be
        an upper bound as it is only used for displaying a progress bar.
    test_entities: list
        List of entities to check if a line is an instance of. For each line (entity), we check if it has a fact of the \
        type (id, query_rel, test_entity).
    collect_labels: bool
        Boolean indicating whether the labels dictionary should be collected.
    """
    pickle_path = get_pickle_path(path)
    collect_facts = (test_entities is not None)

    fails = []
    if collect_labels:
        labels = {}
    if collect_facts:
        facts = []
        n_pickle_dump = 0

    dump = bz2.open(dump_path, 'rt')
    progress_bar = tqdm(total=n_lines)
    counter = 0  # counter of the number of lines read
    line = dump.readline()  # the first line of the file should be "[\n" so we skip it

    while True:  # while there are lines to read
        line = dump.readline().strip()
        if len(line) == 0:
            break
        counter += 1
        progress_bar.update(1)

        try:
            line = to_json(line)
            if collect_labels:
                id_ = get_id(line)
                labels[id_] = get_label(line)
            if collect_facts:
                triplets, instanceOf = to_triplets(line)
                if len(instanceOf) > 0 and intersect(instanceOf, test_entities):
                    facts.extend(triplets)
        except:  # lines that fail to parse or convert are kept aside in fails
            if type(line) == dict and ('claims' in line.keys()):
                if len(line['claims']) != 0:
                    fails.append(line)
            else:
                fails.append(line)

        if counter % 3000000 == 0:  # dump in pickle to free memory
            if collect_facts:
                n_pickle_dump += 1
                facts, fails = write_to_pickle(pickle_path, facts, fails, n_pickle_dump)

    if collect_facts:
        _, _ = write_to_pickle(pickle_path, facts, fails, n_pickle_dump + 1)
    if collect_labels:
        pickle.dump(labels, open(path + 'labels.pkl', 'wb'))
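
A hedged usage sketch for scanning the dump: the file locations and the line count below are placeholders that would need to be adapted, and scanning the full dump is a long-running job. It collects both the facts about instances of subclasses of Q5 (human) and the label dictionary:

    dump_path = '/data/latest-all.json.bz2'   # placeholder: local copy of the Wikidata dump
    out_path = '/data/humans/'                # placeholder: directory where pickle files will be written
    n_lines = 60000000                        # placeholder upper bound, only used for the progress bar

    humans = get_subclasses('Q5')
    query_wikidata_dump(dump_path, out_path, n_lines,
                        test_entities=humans, collect_labels=True)
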
def build_dataset(path, labels, return_=False, dump_date='23rd April 2019'):
    """Builds datasets from the pickle files produced by the query_wikidata_dump function.

    Parameters
    ----------
    path: str
        Path to the directory where there should already be a pickles/ directory. In the latter directory, all \
        the .pkl files will be concatenated into one dataset.
    labels: dict
        Dictionary collected by the query_wikidata_dump function when collect_labels is set to True.
    return_: bool
        Boolean indicating if the built dataset should be returned on top of being written to disk.
    dump_date: str
        String indicating the date of the Wikidata dump used. It is used in the readme of the dataset.

    Returns
    -------
    edges: pandas.DataFrame
        DataFrame containing the edges between entities of the graph.
    attributes: pandas.DataFrame
        DataFrame containing edges linking entities to their attributes.
    entities: pandas.DataFrame
        DataFrame containing a list of all entities & attributes with their Wikidata IDs and labels.
    relations: pandas.DataFrame
        DataFrame containing a list of all relations with their Wikidata IDs and labels.
    """
    if path[-1] != '/':
        path = path + '/'
    path_pickle = path + 'pickles/'
    n_files = len([name for name in os.listdir(path_pickle) if name[-4:] == '.pkl'])

    df = concatpkls(n_files, path_pickle)

    # Core entities are the heads of facts; tails never seen as heads are attribute values.
    ents = list(df['headEntity'].unique())
    feats = list(set(df['tailEntity'].unique()) - set(ents))
    ent2ix = {ent: i for i, ent in enumerate(ents + feats)}
    ix2ent = {i: ent for ent, i in ent2ix.items()}

    tmp = df['relation'].unique()
    rel2ix = {rel: i for i, rel in enumerate(tmp)}
    ix2rel = {i: rel for rel, i in rel2ix.items()}

    # Replace Wikidata IDs by integer indices.
    df['headEntity'] = df['headEntity'].apply(lambda x: ent2ix[x])
    df['tailEntity'] = df['tailEntity'].apply(lambda x: ent2ix[x])
    df['relation'] = df['relation'].apply(lambda x: rel2ix[x])

    nodes = pd.DataFrame([[i, ix2ent[i]] for i in range(len(ents))],
                         columns=['entityID', 'wikidataID'])
    nodes['label'] = nodes['wikidataID'].apply(relabel, args=(labels,))

    entities = pd.DataFrame([[i, ix2ent[i]] for i in range(len(ix2ent))],
                            columns=['entityID', 'wikidataID'])
    entities['label'] = entities['wikidataID'].apply(relabel, args=(labels,))

    relations = pd.DataFrame([[i, ix2rel[i]] for i in range(len(ix2rel))],
                             columns=['relationID', 'wikidataID'])
    relations['label'] = relations['wikidataID'].apply(relabel, args=(labels,))

    # Facts whose tail is itself a head entity are edges of the graph; the others link entities to attributes.
    edges_mask = df.tailEntity.isin(df['headEntity'].unique())
    edges = df.loc[edges_mask, ['headEntity', 'tailEntity', 'relation']]
    attributes = df.loc[~edges_mask, ['headEntity', 'tailEntity', 'relation']]

    write_csv(edges, path + 'edges.tsv')
    write_csv(attributes, path + 'attributes.tsv')
    write_ent_dict(nodes, path + 'nodes.tsv')
    write_ent_dict(entities, path + 'entities.tsv')
    write_rel_dict(relations, path + 'relations.tsv')
    write_readme(path + 'readme.md',
                 n_core_ents=attributes['headEntity'].nunique(),
                 n_attrib_ents=attributes['tailEntity'].nunique(),
                 n_core_rels=edges['relation'].nunique(),
                 n_attrib_rels=attributes['relation'].nunique(),
                 n_core_facts=len(edges),
                 n_attrib_facts=len(attributes),
                 dump_date=dump_date)

    if return_:
        return edges, attributes, entities, relations
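
Continuing the sketch above with the same placeholder paths: once the dump has been scanned, the label dictionary written by query_wikidata_dump can be reloaded and passed to build_dataset, which writes the .tsv files and the readme into out_path:

    import pickle

    labels = pickle.load(open(out_path + 'labels.pkl', 'rb'))
    edges, attributes, entities, relations = build_dataset(out_path, labels, return_=True,
                                                           dump_date='23rd April 2019')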