matchbench.utils package

Submodules

matchbench.utils.column_feature module

matchbench.utils.column_feature.convert_string_lists_to_lists(data: DataFrame, labels: DataFrame)
matchbench.utils.column_feature.extract_bag_of_characters_features(data)
matchbench.utils.column_feature.extract_bag_of_words_features(data)
matchbench.utils.column_feature.extract_topic_features(lda, dic, long_threshold: int, numeric_rep: str, data)
matchbench.utils.column_feature.extract_word_embeddings_features(word_to_embedding, num_embeddings, values)
matchbench.utils.column_feature.infer_paragraph_embeddings_features(model, data)
matchbench.utils.column_feature.process_col(col, long_threshold, numeric_rep)
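Example (a minimal sketch, assuming the feature extractors accept a pandas Series of column values; the exact input type is not stated on this page):

>>> import pandas as pd
>>> from matchbench.utils.column_feature import extract_bag_of_characters_features
>>> col = pd.Series(['Alice', 'Bob', 'Carol'])
>>> char_feats = extract_bag_of_characters_features(col)  # per-character statistics for one column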

matchbench.utils.dataset module

matchbench.utils.dto module

matchbench.utils.dto.make_file(path)
matchbench.utils.dto.readobj(fname)
matchbench.utils.dto.save_array(arr, path, print_len=False, sort_by=None, descending=False, sep='\t', encoding='utf-8')
matchbench.utils.dto.save_map(mp, path, reverse_kv=False, sort_by_key=False, **kwargs)
matchbench.utils.dto.saveobj(obj, fname)
matchbench.utils.dto.to_json(obj)
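Example (a minimal sketch, assuming saveobj/readobj are pickle-style save/load helpers and save_map writes one key-value pair per line; file names are illustrative):

>>> from matchbench.utils.dto import saveobj, readobj, save_map
>>> saveobj({'a': 1}, 'cache.pkl')                      # serialize a picklable object
>>> cached = readobj('cache.pkl')                       # load it back
>>> save_map({'ent0': 0, 'ent1': 1}, 'ent_ids.txt', sort_by_key=True)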

matchbench.utils.emb_loader module

class matchbench.utils.emb_loader.BERT(model='bert-base-cased', pool='max')

Bases: object

bert_encode(sentences, layer=1)
pooled_bert_encode(sentences, layer=1)
pooled_encode_batched(sentences, batch_size=512, layer=1, save_gpu_memory=False)
to(device)
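Example (a minimal sketch, assuming pooled_encode_batched returns one pooled embedding per input sentence; device handling is inferred from to(device)):

>>> from matchbench.utils.emb_loader import BERT
>>> encoder = BERT(model='bert-base-cased', pool='max')
>>> encoder.to('cpu')                                   # or 'cuda' when available
>>> embs = encoder.pooled_encode_batched(['hello world', 'match bench'], batch_size=2)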
class matchbench.utils.emb_loader.EmbeddingLoader(model: str = 'xlm-roberta-base', device=torch.device('cuda'), layer=1)

Bases: object

TR_Models = {'bert-base-cased': (BertModel, BertTokenizer), 'bert-base-multilingual-cased': (BertModel, BertTokenizer), 'bert-base-multilingual-uncased': (BertModel, BertTokenizer), 'bert-base-uncased': (BertModel, BertTokenizer), 'roberta-base': (RobertaModel, RobertaTokenizer), 'xlm-mlm-100-1280': (XLMModel, XLMTokenizer), 'xlm-roberta-base': (XLMRobertaModel, XLMRobertaTokenizer), 'xlm-roberta-large': (XLMRobertaModel, XLMRobertaTokenizer)}
get_embed_list(sent_batch: List[List[str]], get_length=False) → Union[Tensor, Tuple[Tensor, Tensor]]
static get_tokenizer(model: str, *args, **kwargs)
static pretrained_models(model)
matchbench.utils.emb_loader.average_embeds_over_words(bpe_vectors: ndarray, word_tokens_pair: List[List[str]]) → List[array]
matchbench.utils.emb_loader.minus_mask(inputs, input_lens, mask_type='max')
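Example (a minimal sketch; get_embed_list takes a batch of token lists per its signature, but the shape of the returned tensor is an assumption):

>>> import torch
>>> from matchbench.utils.emb_loader import EmbeddingLoader
>>> loader = EmbeddingLoader(model='bert-base-cased', device=torch.device('cpu'), layer=1)
>>> embeds = loader.get_embed_list([['Berlin', 'city'], ['Paris', 'city']])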

matchbench.utils.eval module

matchbench.utils.eval.bi_csls_matrix(sim_matrix0, sim_matrix1, k=10, return2=True) → Union[Tensor, Tuple[Tensor, Tensor]]
matchbench.utils.eval.csls_impl(sim_matrix, dist0, dist1) → Tensor
matchbench.utils.eval.evaluate_embeds(src_emb, trg_emb, link, mapping=None, no_csls=True, rev=False, mrr=False)
matchbench.utils.eval.evaluate_sim_matrix(link, sim_x2y, sim_y2x=None, ignore=(None, None), start='\t', no_csls=True, mrr=False)
matchbench.utils.eval.get_cos_sim(src_emb: Tensor, trg_emb: Tensor, k_ent=10) → Tuple[Tensor, Tensor, Tensor]
matchbench.utils.eval.get_csls_sim(sim_matrix: Tensor, dist0: Tensor, dist1: Tensor) → Tuple[Tensor, Tensor, Tensor]
matchbench.utils.eval.get_hit_k(match_id: Tensor, link: Tensor, src=0, k_list=(1, 3, 5, 10), ignore=None, start='')
matchbench.utils.eval.get_mrr(link: Tensor, sim_matrix: Tensor, which=0, batch_size=4096, start='\t')
matchbench.utils.eval.get_topk_sim(sim: Tensor, k_ent=10) → Tuple[Tensor, Tensor, Tensor]
matchbench.utils.eval.sparse_acc(sp_sim: Tensor, link: Tensor, device='cpu')
matchbench.utils.eval.sparse_top_k(sp_sim: Tensor, link: Tensor, device='cuda', needed=(1, 5, 50), batch_size=512)
matchbench.utils.eval.truncated_mrr(topks: Tensor, link: Tensor, fail=None)
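Example (a minimal sketch of scoring a similarity matrix against gold links; link is assumed to be a 2×N tensor of aligned (source, target) index pairs):

>>> import torch
>>> from matchbench.utils.eval import evaluate_sim_matrix
>>> link = torch.arange(100).repeat(2, 1)               # gold alignment: i <-> i
>>> sim = torch.rand(100, 100)                          # source-to-target similarity
>>> evaluate_sim_matrix(link, sim)                      # reports Hits@k (MRR with mrr=True)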

matchbench.utils.file_tools module

matchbench.utils.file_tools.att_file_make(keep_data, remove_data, keep_file_name, remove_file_name)

Save entity alignment attributes.

matchbench.utils.fuse module

matchbench.utils.fuse.naive_sim_fuser(sims, param=None, device='cuda')
matchbench.utils.fuse.subscribe(x, i)
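Example (a minimal sketch, assuming naive_sim_fuser combines a list of same-shaped similarity matrices into a single fused matrix):

>>> import torch
>>> from matchbench.utils.fuse import naive_sim_fuser
>>> sims = [torch.rand(5, 5), torch.rand(5, 5)]
>>> fused = naive_sim_fuser(sims, device='cpu')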

matchbench.utils.load module

matchbench.utils.load.load_model(model: torch.nn.Module, path: str)
matchbench.utils.load.save_model(model: torch.nn.Module, path: str)
matchbench.utils.load.set_seed(seed)

Set random seed.
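Example (a minimal sketch, assuming save_model/load_model wrap a state-dict round trip; the file name is illustrative):

>>> import torch.nn as nn
>>> from matchbench.utils.load import load_model, save_model, set_seed
>>> set_seed(42)                                        # reproducible runs
>>> net = nn.Linear(4, 2)
>>> save_model(net, 'net.pt')
>>> load_model(net, 'net.pt')                           # assuming an in-place state-dict load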

matchbench.utils.nxmetis module

matchbench.utils.partition module

matchbench.utils.sampler module

matchbench.utils.text_sim module

matchbench.utils.text_sim.approximate_sim(src: Tensor, mapping: Tensor, trg: Tensor, rank=1000, niter=2, keep_k=100, batch_size=5000)
matchbench.utils.text_sim.calc_topk_sim(xs, xt, k=1, which=0, batch_size=2048, split=False, lazy=False)
matchbench.utils.text_sim.get_bert_maxpooling_embs(ent1: Dict[str, int], ent2: Dict[str, int], encode_batch_sz=2048, model='bert-base-multilingual-cased', device='cuda')
matchbench.utils.text_sim.get_ent_token_info(ent1: Dict[str, int], ent2: Dict[str, int], device='cuda', save_prefix='ei_', **kwargs) → Tuple[EntTokenInfo, EntTokenInfo]
matchbench.utils.text_sim.lazy_topk(xs, xt, k=1, req='both') → Union[Tuple[Tensor, Tensor], Tensor]
matchbench.utils.text_sim.makeset(ent_list, num_perm)
matchbench.utils.text_sim.matrix_sinkhorn(pred_or_m, expected=None, a=None, b=None)
matchbench.utils.text_sim.minhash_select_pairs(e1: Iterable[MinHash], e2: Iterable[str], begin_with=0, threshold=0.5, num_perm=128, redis_port=6138)
matchbench.utils.text_sim.sinkhorn_process(M: Tensor)
matchbench.utils.text_sim.sparse_semantic_sim(e1info: EntTokenInfo, e2info: EntTokenInfo, device: device = 'cuda', filter_token_cnt=None) → Tensor
matchbench.utils.text_sim.sparse_string_sim(ent1, ent2, batch_size=1000000, num_perm=128, *args, **kwargs) → Tensor
matchbench.utils.text_sim.token_level_similarity(src_w2e: Tensor, trg_w2e: Tensor, src_word_x: Tensor, trg_word_x: Tensor, sparse_k=1, dense_mm=False, do_sinkhorn=False)
matchbench.utils.text_sim.union(mp, sa, sb, hf, now)
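Example (a minimal sketch; matrix_sinkhorn is assumed to return Sinkhorn-normalized matching scores for a dense score matrix):

>>> import torch
>>> from matchbench.utils.text_sim import matrix_sinkhorn
>>> scores = torch.rand(10, 10)
>>> matching = matrix_sinkhorn(scores)                  # doubly-normalized match scores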

matchbench.utils.text_utils module

class matchbench.utils.text_utils.EntTokenInfo(ents, words, emb, w2e, e2w)

Bases: object

e2w: List[List[int]]
emb: Tensor
ent_cnt()
ents: Dict[str, int]
filter_tokens(k=25, verbose=False)
get_tf_idf(filter_eps=None, filter_tokens=None)
static load(path, *args, **kwargs)
save(path, *args, **kwargs)
static static_high_freq_words(w2e, word_list, k=25, verbose=False)
static static_punc_tokens(word_list, punc=None, verbose=False)
w2e: List[Set[int]]
word_cnt()
words: List[str]
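Example (a minimal sketch; the cache path is hypothetical, and filter_tokens is assumed to drop punctuation and high-frequency tokens, as its static helpers suggest):

>>> from matchbench.utils.text_utils import EntTokenInfo
>>> info = EntTokenInfo.load('ei_src.pkl')              # hypothetical cached file
>>> info.filter_tokens(k=25, verbose=True)
>>> tf_idf = info.get_tf_idf()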
matchbench.utils.text_utils.cpm_embedding(ent2word, words, cpm_types, models=('en', 'fr'))
matchbench.utils.text_utils.edit_dist_of(sent0, sent1, item)
matchbench.utils.text_utils.embed_word2entity(ent2word, word_emb, reduction='max') → Tensor
matchbench.utils.text_utils.faiss_search_impl(emb_q, emb_id, emb_size, shift, k=50, search_batch_sz=50000, gpu=True)
matchbench.utils.text_utils.gen_mean(vals, p)
matchbench.utils.text_utils.get_batch_sim(embed, topk=50, split=True)
matchbench.utils.text_utils.get_count(words, ent_lists, binary=True)
matchbench.utils.text_utils.get_fasttext_aligned_vectors(words, device, lang)
matchbench.utils.text_utils.get_name_feature_map(sents, embedding_loader=None, device='cuda', batch_size=1024, use_fasttext=False, lang=None, **kwargs)
matchbench.utils.text_utils.get_punctuations()
matchbench.utils.text_utils.get_tf_idf(words, ent_lists, bert_tokenizer=None)
matchbench.utils.text_utils.global_level_semantic_sim(embs, k=50, search_batch_sz=50000, index_batch_sz=500000, split=False, norm=True, gpu=True)
matchbench.utils.text_utils.normalize_vectors(embeddings, center=False)
matchbench.utils.text_utils.pairwise_edit_distance(sent0, sent1, to_tensor=True)
matchbench.utils.text_utils.reduce(tensor, reduction='mean', dim=0)
matchbench.utils.text_utils.remove_prefix_to_list(entity_dict: {}, prefix='http(s)?://[a-z\\.]+/[^/]+/', punc='') → []
matchbench.utils.text_utils.remove_punc(str, punc=None)
matchbench.utils.text_utils.selected_edit_distance(sent0, sent1, needed, batch_size=100000)
matchbench.utils.text_utils.tokenize(sent, tokenizer)
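Example (a minimal sketch, assuming pairwise_edit_distance returns an |sent0| × |sent1| distance matrix, as a tensor when to_tensor=True):

>>> from matchbench.utils.text_utils import pairwise_edit_distance
>>> dist = pairwise_edit_distance(['alice', 'bob'], ['bob', 'alice'])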

matchbench.utils.utils module

matchbench.utils.utils.add_cnt_for(mp, val, begin=None)
Parameters:
  • mp – dict mapping a value to its assigned integer id (mp[val] = id)

  • val – the value (e.g. an old id) to look up or register

  • begin – the id to assign when val is not yet in mp (the current id counter)
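Example (a minimal sketch under the reading above; the return value is an assumption):

>>> from matchbench.utils.utils import add_cnt_for
>>> mp = {}
>>> new_id = add_cnt_for(mp, 7)                         # assuming it returns the assigned id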

matchbench.utils.utils.add_logs(k, v)
matchbench.utils.utils.apply(func, *args)
matchbench.utils.utils.apply_on_sparse(func, tensor)
matchbench.utils.utils.argprint(**kwargs)
matchbench.utils.utils.batch_spspmm(a, b, batch_size=1000, verbose=10, filter_softmax=0.01)
matchbench.utils.utils.cosine_distance(x1, x2, eps=1e-08)
matchbench.utils.utils.cosine_sim(x1, x2=None, eps=1e-08)
matchbench.utils.utils.dense_to_sparse(x)
matchbench.utils.utils.dict_values_to_tensor(d: {}, device='cuda')
matchbench.utils.utils.filter_which(x: Tensor, **kwargs)
matchbench.utils.utils.get_iv(sps: List[Tensor])
matchbench.utils.utils.has_key(mp, k)
matchbench.utils.utils.ind2sparse(indices: Tensor, size, size2=None, dtype=torch.float32, values=None)
matchbench.utils.utils.lst_argmax(lst: List[Any], min=False)
matchbench.utils.utils.masked_minmax(a: Tensor, eps=1e-08, masked_val=0.0, in_place=True)
matchbench.utils.utils.matrix_argmax(tensor: Tensor, dim=1)
matchbench.utils.utils.matrix_argmin(tensor: Tensor, dim=1)
matchbench.utils.utils.minmax(a: Tensor, dim=-1, eps=1e-08, in_place=True) → Tensor
matchbench.utils.utils.mp2list(mp, assoc=None)
matchbench.utils.utils.norm_embed(embed: Tensor) → Tensor
matchbench.utils.utils.norm_process(embed: Tensor, eps=1e-05) → Tensor
matchbench.utils.utils.orthogonal_projection(W: Tensor) → Tensor
matchbench.utils.utils.print_size(*args, **kwargs)
matchbench.utils.utils.procrustes(emb1, emb2, link0, link1)
matchbench.utils.utils.random_split(y: Tensor, total=15000, cnt_test=9000, cnt_train=4500, dim=1, device='cuda')
matchbench.utils.utils.rdpm(total, cnt)
matchbench.utils.utils.rebuild_with_indices(sp: Tensor)
matchbench.utils.utils.remain_topk_sim(matrix: Tensor, dim=0, k=1500, split=False)
matchbench.utils.utils.resize_sparse(x: Tensor, new_size, ind_shift)
matchbench.utils.utils.save_similarity_matrix(sparse=False, **kwargs)
matchbench.utils.utils.save_vectors(fname, x, words)
matchbench.utils.utils.scatter_op(tensor: Tensor, op='sum', dim=-1, dim_size=None)
matchbench.utils.utils.seperate_index_type(graph)
matchbench.utils.utils.set_seed(seed)
matchbench.utils.utils.sparse_argmax(tensor, scatter_dim, dim=0)
matchbench.utils.utils.sparse_argmin(tensor, scatter_dim, dim=0)
matchbench.utils.utils.sparse_dense_element_wise_op(sparse: Tensor, dense: Tensor, op=torch.mul)
matchbench.utils.utils.sparse_max(tensor: Tensor, dim=-1)
matchbench.utils.utils.sparse_min(tensor: Tensor, dim=-1)
matchbench.utils.utils.sparse_minmax(a: Tensor, eps=1e-08, in_place=True) → Tensor
matchbench.utils.utils.sparse_softmax(x: Tensor, dim=0)
matchbench.utils.utils.split_sp(sp: Tensor)
matchbench.utils.utils.spmm(s: Tensor, d: Tensor) → Tensor
matchbench.utils.utils.spmm_ds(d: Tensor, s: Tensor) → Tensor
matchbench.utils.utils.spmm_sd(s: Tensor, d: Tensor) → Tensor
matchbench.utils.utils.spspmm(a, b, separate=False)
matchbench.utils.utils.to_dense(x)
matchbench.utils.utils.to_tensor(device, dtype, *args)
matchbench.utils.utils.to_torch_sparse(matrix, dtype=float, device='cuda')
matchbench.utils.utils.topk2spmat(val0, ind0, size, dim=0, device: device = 'cuda', split=False)
matchbench.utils.utils.update_time_logs(action: str)
matchbench.utils.utils.view2(x)
matchbench.utils.utils.view3(x: Tensor) → Tensor
matchbench.utils.utils.view_back(M)
matchbench.utils.utils.z_score(embed)
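Example (a minimal sketch of the dense/sparse helpers; cosine_sim with x2=None is assumed to compute self-similarity, and spmm expects a sparse first operand per its signature):

>>> import torch
>>> from matchbench.utils.utils import cosine_sim, dense_to_sparse, spmm
>>> x = torch.rand(8, 16)
>>> sim = cosine_sim(x)                                 # 8 x 8 pairwise cosine similarity
>>> sp = dense_to_sparse(sim)                           # sparse COO version
>>> out = spmm(sp, torch.rand(8, 4))                    # sparse @ dense product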

Module contents