matchbench.utils package

Submodules

matchbench.utils.column_feature module

matchbench.utils.column_feature.convert_string_lists_to_lists(data: DataFrame, labels: DataFrame)
matchbench.utils.column_feature.extract_bag_of_characters_features(data)
matchbench.utils.column_feature.extract_bag_of_words_features(data)
matchbench.utils.column_feature.extract_topic_features(lda, dic, long_threshold: int, numeric_rep: str, data)
matchbench.utils.column_feature.extract_word_embeddings_features(word_to_embedding, num_embeddings, values)
matchbench.utils.column_feature.infer_paragraph_embeddings_features(model, data)
matchbench.utils.column_feature.process_col(col, long_threshold, numeric_rep)
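Example (a minimal sketch, assuming the feature extractors accept a pandas Series of column values; the exact input type is not stated on this page):

>>> import pandas as pd
>>> from matchbench.utils.column_feature import extract_bag_of_characters_features
>>> col = pd.Series(['Alice', 'Bob', 'Carol'])
>>> char_feats = extract_bag_of_characters_features(col)  # per-character statistics for one column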

matchbench.utils.dataset module

matchbench.utils.dto module

matchbench.utils.dto.make_file(path)
matchbench.utils.dto.readobj(fname)
matchbench.utils.dto.save_array(arr, path, print_len=False, sort_by=None, descending=False, sep='\t', encoding='utf-8')
matchbench.utils.dto.save_map(mp, path, reverse_kv=False, sort_by_key=False, **kwargs)
matchbench.utils.dto.saveobj(obj, fname)
matchbench.utils.dto.to_json(obj)
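Example (a minimal sketch, assuming saveobj/readobj are pickle-style save/load helpers and save_map writes one key-value pair per line; file names are illustrative):

>>> from matchbench.utils.dto import saveobj, readobj, save_map
>>> saveobj({'a': 1}, 'cache.pkl')                      # serialize a picklable object
>>> cached = readobj('cache.pkl')                       # load it back
>>> save_map({'ent0': 0, 'ent1': 1}, 'ent_ids.txt', sort_by_key=True)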

matchbench.utils.emb_loader module

class matchbench.utils.emb_loader.BERT(model='bert-base-cased', pool='max')

Bases: object

bert_encode(sentences, layer=1)
pooled_bert_encode(sentences, layer=1)
pooled_encode_batched(sentences, batch_size=512, layer=1, save_gpu_memory=False)
to(device)
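Example (a minimal sketch, assuming pooled_encode_batched returns one pooled embedding per input sentence; device handling is inferred from to(device)):

>>> from matchbench.utils.emb_loader import BERT
>>> encoder = BERT(model='bert-base-cased', pool='max')
>>> encoder.to('cpu')                                   # or 'cuda' when available
>>> embs = encoder.pooled_encode_batched(['hello world', 'match bench'], batch_size=2)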
class matchbench.utils.emb_loader.EmbeddingLoader(model: str = 'xlm-roberta-base', device=torch.device('cuda'), layer=1)

Bases: object

TR_Models = {'bert-base-cased': (BertModel, BertTokenizer), 'bert-base-multilingual-cased': (BertModel, BertTokenizer), 'bert-base-multilingual-uncased': (BertModel, BertTokenizer), 'bert-base-uncased': (BertModel, BertTokenizer), 'roberta-base': (RobertaModel, RobertaTokenizer), 'xlm-mlm-100-1280': (XLMModel, XLMTokenizer), 'xlm-roberta-base': (XLMRobertaModel, XLMRobertaTokenizer), 'xlm-roberta-large': (XLMRobertaModel, XLMRobertaTokenizer)}
get_embed_list(sent_batch: List[List[str]], get_length=False) → Union[Tensor, Tuple[Tensor, Tensor]]
static get_tokenizer(model: str, *args, **kwargs)
static pretrained_models(model)
matchbench.utils.emb_loader.average_embeds_over_words(bpe_vectors: ndarray, word_tokens_pair: List[List[str]]) → List[array]
matchbench.utils.emb_loader.minus_mask(inputs, input_lens, mask_type='max')
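Example (a minimal sketch; get_embed_list takes a batch of token lists per its signature, but the shape of the returned tensor is an assumption):

>>> import torch
>>> from matchbench.utils.emb_loader import EmbeddingLoader
>>> loader = EmbeddingLoader(model='bert-base-cased', device=torch.device('cpu'), layer=1)
>>> embeds = loader.get_embed_list([['Berlin', 'city'], ['Paris', 'city']])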

matchbench.utils.eval module

matchbench.utils.eval.bi_csls_matrix(sim_matrix0, sim_matrix1, k=10, return2=True) → Union[Tensor, Tuple[Tensor, Tensor]]
matchbench.utils.eval.csls_impl(sim_matrix, dist0, dist1) → Tensor
matchbench.utils.eval.evaluate_embeds(src_emb, trg_emb, link, mapping=None, no_csls=True, rev=False, mrr=False)
matchbench.utils.eval.evaluate_sim_matrix(link, sim_x2y, sim_y2x=None, ignore=(None, None), start='\t', no_csls=True, mrr=False)
matchbench.utils.eval.get_cos_sim(src_emb: Tensor, trg_emb: Tensor, k_ent=10) → Tuple[Tensor, Tensor, Tensor]
matchbench.utils.eval.get_csls_sim(sim_matrix: Tensor, dist0: Tensor, dist1: Tensor) → Tuple[Tensor, Tensor, Tensor]
matchbench.utils.eval.get_hit_k(match_id: Tensor, link: Tensor, src=0, k_list=(1, 3, 5, 10), ignore=None, start='')
matchbench.utils.eval.get_mrr(link: Tensor, sim_matrix: Tensor, which=0, batch_size=4096, start='\t')
matchbench.utils.eval.get_topk_sim(sim: Tensor, k_ent=10) → Tuple[Tensor, Tensor, Tensor]
matchbench.utils.eval.sparse_acc(sp_sim: Tensor, link: Tensor, device='cpu')
matchbench.utils.eval.sparse_top_k(sp_sim: Tensor, link: Tensor, device='cuda', needed=(1, 5, 50), batch_size=512)
matchbench.utils.eval.truncated_mrr(topks: Tensor, link: Tensor, fail=None)
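Example (a minimal sketch of scoring a similarity matrix against gold links; link is assumed to be a 2×N tensor of aligned (source, target) index pairs):

>>> import torch
>>> from matchbench.utils.eval import evaluate_sim_matrix
>>> link = torch.arange(100).repeat(2, 1)               # gold alignment: i <-> i
>>> sim = torch.rand(100, 100)                          # source-to-target similarity
>>> evaluate_sim_matrix(link, sim)                      # reports Hits@k (MRR with mrr=True)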

matchbench.utils.file_tools module

matchbench.utils.file_tools.att_file_make(keep_data, remove_data, keep_file_name, remove_file_name)

Save entity alignment attributes.

matchbench.utils.fuse module

matchbench.utils.fuse.naive_sim_fuser(sims, param=None, device='cuda')
matchbench.utils.fuse.subscribe(x, i)
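Example (a minimal sketch, assuming naive_sim_fuser combines a list of same-shaped similarity matrices into a single fused matrix):

>>> import torch
>>> from matchbench.utils.fuse import naive_sim_fuser
>>> sims = [torch.rand(5, 5), torch.rand(5, 5)]
>>> fused = naive_sim_fuser(sims, device='cpu')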

matchbench.utils.load module

matchbench.utils.load.load_model(model: torch.nn.Module, path: str)
matchbench.utils.load.save_model(model: torch.nn.Module, path: str)
matchbench.utils.load.set_seed(seed)

Set random seed.
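Example (a minimal sketch, assuming save_model/load_model wrap a state-dict round trip; the file name is illustrative):

>>> import torch.nn as nn
>>> from matchbench.utils.load import load_model, save_model, set_seed
>>> set_seed(42)                                        # reproducible runs
>>> net = nn.Linear(4, 2)
>>> save_model(net, 'net.pt')
>>> load_model(net, 'net.pt')                           # assuming an in-place state-dict load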

matchbench.utils.nxmetis module

matchbench.utils.partition module

matchbench.utils.sampler module

matchbench.utils.text_sim module

matchbench.utils.text_sim.approximate_sim(src: Tensor, mapping: Tensor, trg: Tensor, rank=1000, niter=2, keep_k=100, batch_size=5000)
matchbench.utils.text_sim.calc_topk_sim(xs, xt, k=1, which=0, batch_size=2048, split=False, lazy=False)
matchbench.utils.text_sim.get_bert_maxpooling_embs(ent1: Dict[str, int], ent2: Dict[str, int], encode_batch_sz=2048, model='bert-base-multilingual-cased', device='cuda')
matchbench.utils.text_sim.get_ent_token_info(ent1: Dict[str, int], ent2: Dict[str, int], device='cuda', save_prefix='ei_', **kwargs) → Tuple[EntTokenInfo, EntTokenInfo]
matchbench.utils.text_sim.lazy_topk(xs, xt, k=1, req='both') → Union[Tuple[Tensor, Tensor], Tensor]
matchbench.utils.text_sim.makeset(ent_list, num_perm)
matchbench.utils.text_sim.matrix_sinkhorn(pred_or_m, expected=None, a=None, b=None)
matchbench.utils.text_sim.minhash_select_pairs(e1: Iterable[MinHash], e2: Iterable[str], begin_with=0, threshold=0.5, num_perm=128, redis_port=6138)
matchbench.utils.text_sim.sinkhorn_process(M: Tensor)
matchbench.utils.text_sim.sparse_semantic_sim(e1info: EntTokenInfo, e2info: EntTokenInfo, device: device = 'cuda', filter_token_cnt=None) → Tensor
matchbench.utils.text_sim.sparse_string_sim(ent1, ent2, batch_size=1000000, num_perm=128, *args, **kwargs) → Tensor
matchbench.utils.text_sim.token_level_similarity(src_w2e: Tensor, trg_w2e: Tensor, src_word_x: Tensor, trg_word_x: Tensor, sparse_k=1, dense_mm=False, do_sinkhorn=False)
matchbench.utils.text_sim.union(mp, sa, sb, hf, now)
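Example (a minimal sketch; matrix_sinkhorn is assumed to return Sinkhorn-normalized matching scores for a dense score matrix):

>>> import torch
>>> from matchbench.utils.text_sim import matrix_sinkhorn
>>> scores = torch.rand(10, 10)
>>> matching = matrix_sinkhorn(scores)                  # doubly-normalized match scores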

matchbench.utils.text_utils module

class matchbench.utils.text_utils.EntTokenInfo(ents, words, emb, w2e, e2w)

Bases: object

e2w: List[List[int]]
emb: Tensor
ent_cnt()
ents: Dict[str, int]
filter_tokens(k=25, verbose=False)
get_tf_idf(filter_eps=None, filter_tokens=None)
static load(path, *args, **kwargs)
save(path, *args, **kwargs)
static static_high_freq_words(w2e, word_list, k=25, verbose=False)
static static_punc_tokens(word_list, punc=None, verbose=False)
w2e: List[Set[int]]
word_cnt()
words: List[str]
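Example (a minimal sketch; the cache path is hypothetical, and filter_tokens is assumed to drop punctuation and high-frequency tokens, as its static helpers suggest):

>>> from matchbench.utils.text_utils import EntTokenInfo
>>> info = EntTokenInfo.load('ei_src.pkl')              # hypothetical cached file
>>> info.filter_tokens(k=25, verbose=True)
>>> tf_idf = info.get_tf_idf()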
matchbench.utils.text_utils.cpm_embedding(ent2word, words, cpm_types, models=('en', 'fr'))
matchbench.utils.text_utils.edit_dist_of(sent0, sent1, item)
matchbench.utils.text_utils.embed_word2entity(ent2word, word_emb, reduction='max') → Tensor
matchbench.utils.text_utils.faiss_search_impl(emb_q, emb_id, emb_size, shift, k=50, search_batch_sz=50000, gpu=True)
matchbench.utils.text_utils.gen_mean(vals, p)
matchbench.utils.text_utils.get_batch_sim(embed, topk=50, split=True)
matchbench.utils.text_utils.get_count(words, ent_lists, binary=True)
matchbench.utils.text_utils.get_fasttext_aligned_vectors(words, device, lang)
matchbench.utils.text_utils.get_name_feature_map(sents, embedding_loader=None, device='cuda', batch_size=1024, use_fasttext=False, lang=None, **kwargs)
matchbench.utils.text_utils.get_punctuations()
matchbench.utils.text_utils.get_tf_idf(words, ent_lists, bert_tokenizer=None)
matchbench.utils.text_utils.global_level_semantic_sim(embs, k=50, search_batch_sz=50000, index_batch_sz=500000, split=False, norm=True, gpu=True)
matchbench.utils.text_utils.normalize_vectors(embeddings, center=False)
matchbench.utils.text_utils.pairwise_edit_distance(sent0, sent1, to_tensor=True)
matchbench.utils.text_utils.reduce(tensor, reduction='mean', dim=0)
matchbench.utils.text_utils.remove_prefix_to_list(entity_dict: {}, prefix='http(s)?://[a-z\\.]+/[^/]+/', punc='') → []
matchbench.utils.text_utils.remove_punc(str, punc=None)
matchbench.utils.text_utils.selected_edit_distance(sent0, sent1, needed, batch_size=100000)
matchbench.utils.text_utils.tokenize(sent, tokenizer)
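Example (a minimal sketch, assuming pairwise_edit_distance returns an |sent0| × |sent1| distance matrix, as a tensor when to_tensor=True):

>>> from matchbench.utils.text_utils import pairwise_edit_distance
>>> dist = pairwise_edit_distance(['alice', 'bob'], ['bob', 'alice'])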

matchbench.utils.utils module

matchbench.utils.utils.add_cnt_for(mp, val, begin=None)
Parameters:
  • mp – dict mapping a value to its assigned integer id (mp[val] = id)

  • val – the value (e.g. an old id) to look up or register

  • begin – the id to assign when val is not yet in mp (the current id counter)
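Example (a minimal sketch under the reading above; the return value is an assumption):

>>> from matchbench.utils.utils import add_cnt_for
>>> mp = {}
>>> new_id = add_cnt_for(mp, 7)                         # assuming it returns the assigned id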

matchbench.utils.utils.add_logs(k, v)
matchbench.utils.utils.apply(func, *args)
matchbench.utils.utils.apply_on_sparse(func, tensor)
matchbench.utils.utils.argprint(**kwargs)
matchbench.utils.utils.batch_spspmm(a, b, batch_size=1000, verbose=10, filter_softmax=0.01)
matchbench.utils.utils.cosine_distance(x1, x2, eps=1e-08)
matchbench.utils.utils.cosine_sim(x1, x2=None, eps=1e-08)
matchbench.utils.utils.dense_to_sparse(x)
matchbench.utils.utils.dict_values_to_tensor(d: {}, device='cuda')
matchbench.utils.utils.filter_which(x: Tensor, **kwargs)
matchbench.utils.utils.get_iv(sps: List[Tensor])
matchbench.utils.utils.has_key(mp, k)
matchbench.utils.utils.ind2sparse(indices: Tensor, size, size2=None, dtype=torch.float32, values=None)
matchbench.utils.utils.lst_argmax(lst: List[Any], min=False)
matchbench.utils.utils.masked_minmax(a: Tensor, eps=1e-08, masked_val=0.0, in_place=True)
matchbench.utils.utils.matrix_argmax(tensor: Tensor, dim=1)
matchbench.utils.utils.matrix_argmin(tensor: Tensor, dim=1)
matchbench.utils.utils.minmax(a: Tensor, dim=-1, eps=1e-08, in_place=True) → Tensor
matchbench.utils.utils.mp2list(mp, assoc=None)
matchbench.utils.utils.norm_embed(embed: Tensor) → Tensor
matchbench.utils.utils.norm_process(embed: Tensor, eps=1e-05) → Tensor
matchbench.utils.utils.orthogonal_projection(W: Tensor) → Tensor
matchbench.utils.utils.print_size(*args, **kwargs)
matchbench.utils.utils.procrustes(emb1, emb2, link0, link1)
matchbench.utils.utils.random_split(y: Tensor, total=15000, cnt_test=9000, cnt_train=4500, dim=1, device='cuda')
matchbench.utils.utils.rdpm(total, cnt)
matchbench.utils.utils.rebuild_with_indices(sp: Tensor)
matchbench.utils.utils.remain_topk_sim(matrix: Tensor, dim=0, k=1500, split=False)
matchbench.utils.utils.resize_sparse(x: Tensor, new_size, ind_shift)
matchbench.utils.utils.save_similarity_matrix(sparse=False, **kwargs)
matchbench.utils.utils.save_vectors(fname, x, words)
matchbench.utils.utils.scatter_op(tensor: Tensor, op='sum', dim=-1, dim_size=None)
matchbench.utils.utils.seperate_index_type(graph)
matchbench.utils.utils.set_seed(seed)
matchbench.utils.utils.sparse_argmax(tensor, scatter_dim, dim=0)
matchbench.utils.utils.sparse_argmin(tensor, scatter_dim, dim=0)
matchbench.utils.utils.sparse_dense_element_wise_op(sparse: Tensor, dense: Tensor, op=torch.mul)
matchbench.utils.utils.sparse_max(tensor: Tensor, dim=-1)
matchbench.utils.utils.sparse_min(tensor: Tensor, dim=-1)
matchbench.utils.utils.sparse_minmax(a: Tensor, eps=1e-08, in_place=True) → Tensor
matchbench.utils.utils.sparse_softmax(x: Tensor, dim=0)
matchbench.utils.utils.split_sp(sp: Tensor)
matchbench.utils.utils.spmm(s: Tensor, d: Tensor) → Tensor
matchbench.utils.utils.spmm_ds(d: Tensor, s: Tensor) → Tensor
matchbench.utils.utils.spmm_sd(s: Tensor, d: Tensor) → Tensor
matchbench.utils.utils.spspmm(a, b, separate=False)
matchbench.utils.utils.to_dense(x)
matchbench.utils.utils.to_tensor(device, dtype, *args)
matchbench.utils.utils.to_torch_sparse(matrix, dtype=float, device='cuda')
matchbench.utils.utils.topk2spmat(val0, ind0, size, dim=0, device: device = 'cuda', split=False)
matchbench.utils.utils.update_time_logs(action: str)
matchbench.utils.utils.view2(x)
matchbench.utils.utils.view3(x: Tensor) → Tensor
matchbench.utils.utils.view_back(M)
matchbench.utils.utils.z_score(embed)
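Example (a minimal sketch of the dense/sparse helpers; cosine_sim with x2=None is assumed to compute self-similarity, and spmm expects a sparse first operand per its signature):

>>> import torch
>>> from matchbench.utils.utils import cosine_sim, dense_to_sparse, spmm
>>> x = torch.rand(8, 16)
>>> sim = cosine_sim(x)                                 # 8 x 8 pairwise cosine similarity
>>> sp = dense_to_sparse(sim)                           # sparse COO version
>>> out = spmm(sp, torch.rand(8, 4))                    # sparse @ dense product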

Module contents