# Source code for wikidatasets.processFunctions

import bz2
import pickle
import os
import pandas as pd

from tqdm import tqdm
from wikidatasets.utils import get_results, clean
from wikidatasets.utils import get_pickle_path, write_to_pickle
from wikidatasets.utils import get_id, get_label, to_triplets, intersect, to_json
from wikidatasets.utils import concatpkls, write_csv, write_ent_dict, write_rel_dict, write_readme, relabel

def get_subclasses(subject):
    """Get a list of WikiData IDs of entities which are subclasses of the subject.

    Parameters
    ----------
    subject: str
        String describing the subject (e.g. 'Q5' for human). It is inserted
        verbatim into the SPARQL query, so it must be a bare Wikidata ID.

    Returns
    -------
    result: list
        List of WikiData IDs of entities which are subclasses of the subject,
        one per binding returned by the query (each passed through `clean`).
    """
    # Public Wikidata SPARQL endpoint; an empty URL cannot be queried.
    endpoint_url = "https://query.wikidata.org/sparql"
    # `P279*` is a zero-or-more property path over "subclass of", so the
    # subject itself is matched along with its transitive subclasses.
    query = """SELECT ?item WHERE {?item wdt:P279* wd:""" + subject + """ .}"""
    results = get_results(endpoint_url, query)
    return [clean(result['item']['value']) for result in results['results']['bindings']]
def query_wikidata_dump(dump_path, path, n_lines, test_entities=None, collect_labels=False):
    """Go through a Wikidata dump and collect facts and/or labels.

    The function can either collect the facts of entities that are instances
    of `test_entities`, or collect the dictionary of labels. It can also do
    both in a single pass.

    Parameters
    ----------
    dump_path: str
        Path to the bz2-compressed Wikidata JSON dump (latest-all.json.bz2).
    path: str
        Path to where pickle files will be written. `labels.pkl` is written
        at `path + 'labels.pkl'`, so `path` should end with a separator.
    n_lines: int
        Number of lines of the dump. Fastest way I found was
        `$ bzgrep -c ".*" latest-all.json.bz2`. This can be an upper-bound as
        it is only used for displaying a progress bar.
    test_entities: list
        List of entities to check if a line is instance of. For each line
        (entity), the triplets returned by `to_triplets` are kept when its
        instance-of list intersects `test_entities`. When None, no facts
        are collected.
    collect_labels: bool
        Boolean indicating whether the labels dictionary should be collected.
    """
    pickle_path = get_pickle_path(path)
    collect_facts = test_entities is not None
    flush_every = 3000000  # number of lines between two intermediate pickle dumps

    fails = []
    labels = {}
    facts = []
    n_pickle_dump = 0
    counter = 0  # counter of the number of lines read

    # Context managers guarantee the dump and the progress bar are closed,
    # even if the pass is interrupted.
    with bz2.open(dump_path, 'rt') as dump, tqdm(total=n_lines) as progress_bar:
        dump.readline()  # the first line of the file should be "[\n" so we skip it

        for raw_line in dump:
            line = raw_line.strip()
            if not line:
                # an empty line marks the end of the readable content
                break
            counter += 1
            progress_bar.update(1)

            try:
                line = to_json(line)

                if collect_labels:
                    id_ = get_id(line)
                    labels[id_] = get_label(line)

                if collect_facts:
                    triplets, instance_of = to_triplets(line)
                    if len(instance_of) > 0 and intersect(instance_of, test_entities):
                        facts.extend(triplets)

            except Exception:
                # `Exception` (not a bare except) so that Ctrl-C still stops
                # the run. A parsed entity with an empty 'claims' mapping is
                # not recorded as a failure; anything else is kept (raw string
                # if parsing failed, dict otherwise).
                claimless_entity = (isinstance(line, dict)
                                    and 'claims' in line
                                    and len(line['claims']) == 0)
                if not claimless_entity:
                    fails.append(line)

            if collect_facts and counter % flush_every == 0:
                # dump in pickle to free memory
                n_pickle_dump += 1
                facts, fails = write_to_pickle(pickle_path, facts, fails, n_pickle_dump)

    if collect_facts:
        # final dump of whatever was collected since the last intermediate one
        write_to_pickle(pickle_path, facts, fails, n_pickle_dump + 1)

    if collect_labels:
        with open(path + 'labels.pkl', 'wb') as labels_file:
            pickle.dump(labels, labels_file)
def build_dataset(path, labels, return_=False, dump_date='23rd April 2019'):
    """Build datasets from the pickle files produced by query_wikidata_dump.

    Parameters
    ----------
    path: str
        Path to the directory where there should already be a pickles/
        directory. In the latter directory, all the .pkl files will be
        concatenated into one dataset. A trailing '/' is added if missing.
    labels: dict
        Dictionary collected by the query_wikidata_dump function when
        collect_labels is set to True.
    return_: bool
        Boolean indicating if the built dataset should be returned on top of
        being written on disk.
    dump_date: str
        String indicating the date of the Wikidata dump used. It is used in
        the readme of the dataset.

    Returns
    -------
    edges: pandas.DataFrame
        DataFrame containing the edges between entities of the graph.
    attributes: pandas.DataFrame
        DataFrame containing edges linking entities to their attributes.
    entities: pandas.DataFrame
        DataFrame containing a list of all entities & attributes with their
        Wikidata IDs and labels.
    relations: pandas.DataFrame
        DataFrame containing a list of all relations with their Wikidata IDs
        and labels.

    Nothing is returned when `return_` is False.
    """
    # Guard against an empty path (treated as the current directory) instead
    # of failing on path[-1].
    if path and not path.endswith('/'):
        path = path + '/'
    path_pickle = path + 'pickles/'

    n_files = sum(1 for name in os.listdir(path_pickle) if name.endswith('.pkl'))
    df = concatpkls(n_files, path_pickle)

    # Core entities appear as head of at least one fact; the remaining tails
    # are attribute-only entities. Core entities get the lowest indices.
    ents = list(df['headEntity'].unique())
    feats = list(set(df['tailEntity'].unique()) - set(ents))
    all_ents = ents + feats
    rels = list(df['relation'].unique())

    ent2ix = {ent: i for i, ent in enumerate(all_ents)}
    rel2ix = {rel: i for i, rel in enumerate(rels)}

    # Replace Wikidata IDs by integer indices.
    df['headEntity'] = df['headEntity'].map(ent2ix)
    df['tailEntity'] = df['tailEntity'].map(ent2ix)
    df['relation'] = df['relation'].map(rel2ix)

    nodes = pd.DataFrame(list(enumerate(ents)), columns=['entityID', 'wikidataID'])
    nodes['label'] = nodes['wikidataID'].apply(relabel, args=(labels,))

    entities = pd.DataFrame(list(enumerate(all_ents)), columns=['entityID', 'wikidataID'])
    entities['label'] = entities['wikidataID'].apply(relabel, args=(labels,))

    relations = pd.DataFrame(list(enumerate(rels)), columns=['relationID', 'wikidataID'])
    relations['label'] = relations['wikidataID'].apply(relabel, args=(labels,))

    # A fact is an edge when its tail is itself a core entity, otherwise it
    # is an attribute.
    fact_columns = ['headEntity', 'tailEntity', 'relation']
    edges_mask = df['tailEntity'].isin(df['headEntity'].unique())
    edges = df.loc[edges_mask, fact_columns]
    attributes = df.loc[~edges_mask, fact_columns]

    write_csv(edges, path + 'edges.tsv')
    write_csv(attributes, path + 'attributes.tsv')
    write_ent_dict(nodes, path + 'nodes.tsv')
    write_ent_dict(entities, path + 'entities.tsv')
    write_rel_dict(relations, path + 'relations.tsv')
    # The readme needs an actual file name; `path` alone is the directory.
    write_readme(path + 'readme.md',
                 n_core_ents=attributes['headEntity'].nunique(),
                 n_attrib_ents=attributes['tailEntity'].nunique(),
                 n_core_rels=edges['relation'].nunique(),
                 n_attrib_rels=attributes['relation'].nunique(),
                 n_core_facts=len(edges),
                 n_attrib_facts=len(attributes),
                 dump_date=dump_date)

    if return_:
        return edges, attributes, entities, relations