Source code for wikidatasets.utils

import pickle
import json
import pandas as pd
import os

from wikidatasets.exceptions import ParsingException
from tqdm import tqdm

from SPARQLWrapper import SPARQLWrapper, JSON


def get_results(endpoint_url, query):
    """Send a SPARQL query to endpoint_url and return the decoded JSON results."""
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()
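
# Illustrative usage (not part of the original module): query the public Wikidata
# SPARQL endpoint for a few entities that are instances of human (Q5).
#
#     >>> url = 'https://query.wikidata.org/sparql'
#     >>> query = 'SELECT ?item WHERE { ?item wdt:P31 wd:Q5 . } LIMIT 5'
#     >>> results = get_results(url, query)
#     >>> [b['item']['value'] for b in results['results']['bindings']]  # five entity URIs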


def to_json(line):
    """Parse one line of the dump into a dictionary, stripping the trailing ','."""
    if line[-1] == ',':
        line = line[:-1]  # entity lines of the dump generally end with a ','

    # turn the string into a dict
    if line[0] != '{' or line[-1] != '}':
        # this line is not a proper JSON object (e.g. the opening '[' or closing ']'
        # of the dump); raise so that the caller can deal with it later
        raise ParsingException

    return json.loads(line)
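
# Illustrative usage (not part of the original module), with a minimal, hypothetical
# dump line:
#
#     >>> to_json('{"id": "Q42", "type": "item"},')
#     {'id': 'Q42', 'type': 'item'}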


def concat_claims(claims):
    """

    Parameters
    ----------
    claims: dict

    Returns
    -------
    iter: iterator
        iterator through the claims
    """

    for rel_id, rel_claims in claims.items():
        for claim in rel_claims:
            yield claim
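
# Illustrative usage (not part of the original module): the claims dict below is a
# hypothetical, simplified version of the 'claims' field of a dump entity.
#
#     >>> claims = {'P31': [{'mainsnak': 'a'}, {'mainsnak': 'b'}],
#     ...           'P27': [{'mainsnak': 'c'}]}
#     >>> list(concat_claims(claims))
#     [{'mainsnak': 'a'}, {'mainsnak': 'b'}, {'mainsnak': 'c'}]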


def to_triplets(ent):
    """

    Parameters
    ----------
    ent: dict
        Dictionary coming from the parsing of a json line of the dump.

    Returns
    -------
    triplets: list
        List of triplets of this entity (head, rel, tail).
    """
    if len(ent['claims']) == 0:
        return []
    claims = concat_claims(ent['claims'])
    triplets = []
    instanceof = []
    e1 = ent['id']
    for claim in claims:
        mainsnak = claim['mainsnak']
        if mainsnak['snaktype'] != "value":
            continue  # skip claims with an unknown ('somevalue') or missing ('novalue') value
        if mainsnak['datatype'] == 'wikibase-item':
            rel = mainsnak['property']
            e2 = 'Q{}'.format(mainsnak['datavalue']['value']['numeric-id'])
            triplets.append((e1, rel, e2))
            if rel == 'P31':
                instanceof.append(e2)
    return triplets, instanceof
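
# Illustrative usage (not part of the original module): a hypothetical entity with a
# single P31 (instance of) claim pointing to Q5 yields one triplet and one instanceof value.
#
#     >>> ent = {'id': 'Q42',
#     ...        'claims': {'P31': [{'mainsnak': {'snaktype': 'value',
#     ...                                         'datatype': 'wikibase-item',
#     ...                                         'property': 'P31',
#     ...                                         'datavalue': {'value': {'numeric-id': 5}}}}]}}
#     >>> to_triplets(ent)
#     ([('Q42', 'P31', 'Q5')], ['Q5'])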


def get_type(ent):
    return ent['type']


def get_id(ent):
    return ent['id']


def get_label(ent):
    """

    Parameters
    ----------
    ent: dict
        Dictionary coming from the parsing of a json line of the dump.

    Returns
    -------
    label: str
        Label of ent in english if available of any other language else.
    """

    labels = ent['labels']
    if len(labels) == 0:
        return 'No label {}'.format(ent['id'])
    if 'en' in labels.keys():
        return labels['en']['value']
    else:
        return labels[list(labels.keys())[0]]['value']
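
# Illustrative usage (not part of the original module), with hypothetical minimal entities:
#
#     >>> get_label({'id': 'Q42', 'labels': {'en': {'value': 'Douglas Adams'}}})
#     'Douglas Adams'
#     >>> get_label({'id': 'Q42', 'labels': {}})
#     'No label Q42'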


def relabel(x, labels):
    """Return the label of x, keeping only the part after the first ':' if any; return x if it has no label."""
    try:
        lab = labels[x]
        if ':' in lab:
            return lab[lab.index(':')+1:]
        else:
            return lab
    except KeyError:
        return x
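
# Illustrative usage (not part of the original module); the 'prefix:label' format of
# the labels dict below is only an assumption for demonstration.
#
#     >>> labels = {'Q5': 'Q5:human'}
#     >>> relabel('Q5', labels)
#     'human'
#     >>> relabel('Q146', labels)  # unknown IDs are returned unchanged
#     'Q146'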


def clean(str_):
    prefix = 'http://www.wikidata.org/entity/'
    if str_.startswith(prefix):
        return str_[len(prefix):]
    else:
        print('Unexpected entity URI: {}'.format(str_))
        return ''


def get_pickle_path(path):
    if path[-1] != '/':
        path = path+'/'
    pickle_path = path + 'pickles/'
    if not os.path.exists(pickle_path):
        os.makedirs(pickle_path)
    return pickle_path


def write_to_pickle(pickle_path, facts, fails, n_pickle_dump):
    with open(pickle_path + 'dump{}.pkl'.format(n_pickle_dump), 'wb') as f:
        pickle.dump((facts, fails), f)
    print('Just made pickle dump number {}'.format(n_pickle_dump))
    return [], []


def intersect(long_list, short_list):
    return len(set(long_list).intersection(set(short_list))) > 0


def count_true_fails(fails):
    """Count failed lines that actually contained claims (ignores the closing ']' of the dump)."""
    true_fails = 0
    for f in fails:
        try:
            if str(f) == ']':
                continue  # in this case it's the last line of the original dump file
            if len(f['claims']) > 0:
                true_fails += 1
        except Exception:
            print(f)
    return true_fails


def concatpkls(n_dump, path_pickle, labels=None):
    """Concatenate the n_dump pickle dumps found in path_pickle into a single DataFrame of facts."""
    df = pd.DataFrame(columns=['headEntity', 'relation', 'tailEntity'])

    for nd in tqdm(range(n_dump)):
        with open(path_pickle + 'dump{}.pkl'.format(nd + 1), 'rb') as f:
            facts, fails = pickle.load(f)
            true_fails = count_true_fails(fails)
            if true_fails > 0:
                print('{} true fails'.format(true_fails))
        df = pd.concat([df, pd.DataFrame(facts, columns=['headEntity', 'relation', 'tailEntity'])])
    df = df.drop_duplicates()

    if labels is not None:
        df['headEntity'] = df['headEntity'].apply(relabel, args=(labels,))
        df['relation'] = df['relation'].apply(relabel, args=(labels,))
        df['tailEntity'] = df['tailEntity'].apply(relabel, args=(labels,))

    return df
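
# Illustrative usage (not part of the original module), assuming three dumps
# dump1.pkl .. dump3.pkl were previously written to './pickles/' by write_to_pickle:
#
#     >>> df = concatpkls(n_dump=3, path_pickle='./pickles/')
#     >>> list(df.columns)
#     ['headEntity', 'relation', 'tailEntity']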


def write_csv(df, name):
    with open(name, 'w', encoding='utf-8') as f:
        f.write('headEntity\ttailEntity\trelation\n')
        df.to_csv(f, sep='\t', header=False, index=False)


def write_ent_dict(df, name):
    with open(name, 'w', encoding='utf-8') as f:
        f.write('entityID\twikidataID\tlabel\n')
        df.to_csv(f, sep='\t', header=False, index=False)


def write_rel_dict(df, name):
    with open(name, 'w', encoding='utf-8') as f:
        f.write('relationID\twikidataID\tlabel\n')
        df.to_csv(f, sep='\t', header=False, index=False)


def write_readme(name, n_core_ents, n_attrib_ents,
                 n_core_rels, n_attrib_rels,
                 n_core_facts, n_attrib_facts,
                 dump_date):
    with open(name, 'w', encoding='utf-8') as f:
        f.write('Built with WikiData dump from {}.\n'.format(dump_date))
        f.write("Here are some meta data about this data set:\n")
        f.write("Core entities: {}\n".format(n_core_ents))
        f.write("Attribute entities: {}\n".format(n_attrib_ents))
        f.write("Core relations: {} (number of different relations involving only core entities)\n".format(n_core_rels))
        f.write("Attribute relations: {} (number of different relations from core entities "
                "to attribute entities)\n".format(n_attrib_rels))
        f.write("Core facts: {} (facts involving only core entities)\n".format(n_core_facts))
        f.write("Attribute facts : {} (facts linking core entities "
                "to their attribute entities)\n".format(n_attrib_facts))
        f.write("Find more details about this dataset at https://arxiv.org/abs/1906.04536.")


def load_data_labels(path, attributes=False, return_dicts=False):
    """This function loads the edges or attributes file into a pandas DataFrame and merges in the labels of \
    entities and relations.

    Parameters
    ----------
    path: str
        Path to the directory containing the edges.tsv, attributes.tsv, entities.tsv, relations.tsv files.
    attributes: bool
        Boolean indicating whether the attributes file should be read. If False, the edges file is read.
    return_dicts: bool
        Boolean indicating whether the entities and relations label DataFrames should also be returned.

    Returns
    -------
    df: pandas.DataFrame
        DataFrame containing either the edges or the attributes depending on the value of `attributes`,
        with the labels of head entities, tail entities and relations merged in.
    entities: pandas.DataFrame
        DataFrame containing the list of all entities with their Wikidata IDs and labels
        (only returned if `return_dicts` is True).
    relations: pandas.DataFrame
        DataFrame containing the list of all relations with their Wikidata IDs and labels
        (only returned if `return_dicts` is True).
    """
    if attributes:
        df = pd.read_csv(path + 'attributes.tsv', sep='\t')
    else:
        df = pd.read_csv(path + 'edges.tsv', sep='\t')
    entities = pd.read_csv(path + 'entities.tsv', sep='\t')
    relations = pd.read_csv(path + 'relations.tsv', sep='\t')

    df = pd.merge(left=df, right=entities[['entityID', 'label']],
                  left_on='headEntity', right_on='entityID')
    df.drop('entityID', axis=1, inplace=True)
    df.columns = list(df.columns[:-1]) + ['headLabel']

    df = pd.merge(left=df, right=entities[['entityID', 'label']],
                  left_on='tailEntity', right_on='entityID')
    df.drop('entityID', axis=1, inplace=True)
    df.columns = list(df.columns[:-1]) + ['tailLabel']

    df = pd.merge(left=df, right=relations[['relationID', 'label']],
                  left_on='relation', right_on='relationID')
    df.drop('relationID', axis=1, inplace=True)
    df.columns = list(df.columns[:-1]) + ['relationLabel']

    if return_dicts:
        return df, entities, relations
    return df
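
# Illustrative usage (not part of the original module); 'humans/' stands for a
# hypothetical directory containing the edges.tsv, attributes.tsv, entities.tsv and
# relations.tsv files of a dataset:
#
#     >>> df, entities, relations = load_data_labels('humans/', return_dicts=True)
#     >>> df[['headLabel', 'relationLabel', 'tailLabel']].head()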