class Corpus[source]
Corpus(infile,tokenizer,isdir=False,dropout=0)
infile: A file that stores SMILES line-by-line.
tokenizer: SPE tokenizer
dropout: SPE dropout, default = 0
learn_spe2vec[source]
learn_spe2vec(corpus,outfile=None,vector_size=100,window=10,min_count=10,n_jobs=1,method='skip-gram', **kwargs)
Train a spe2vec model.
corpus: an instance of the Corpus class
outfile: str, name of the spe2vec model file.
vector_size: dimensions of embedding.
window: number of tokens considered as context
min_count: number of occurrences a token should have to be considered in training
n_jobs: number of CPU cores used for training.
method: modeling method, choose from ['cbow', 'skip-gram']
More training parameters can be found at https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
load_spe2vec[source]
load_spe2vec(model_path)
class SPE2Vec[source]
SPE2Vec(model_path,tokenizer)
s.tokenize('c1ccccc1C')
'c1ccccc1 C'
s.spe2vec('c1ccccc1C')
[array([ 0.00324177, -0.18124679, 0.1894573 , 0.29736474, -0.14143717,
-0.03290153, -0.31891045, 0.16373567, -0.12413523, -0.08658446,
-0.23956653, 0.05335753, 0.18146366, -0.17212407, -0.17879114,
-0.01039552, -0.00274071, 0.01653983, 0.08432296, -0.15634526,
0.29629305, -0.16786121, 0.06479991, 0.34462902, -0.11052489,
-0.13513446, 0.16418819, -0.21508686, -0.01842665, -0.15818536,
-0.05421342, 0.2041645 , 0.14783993, -0.00653112, -0.19034739,
-0.11876111, 0.12208337, -0.0743893 , 0.03400969, 0.04422404,
-0.10224582, 0.34490895, 0.12326851, -0.08695894, -0.08150315,
0.09907438, 0.28797793, 0.15912676, 0.15228626, -0.164707 ,
0.33839643, -0.04265443, -0.11858924, 0.10059267, -0.24335982,
-0.02948368, 0.53029126, 0.2448303 , 0.11335112, 0.01153868,
-0.01010862, -0.06406022, -0.01338368, -0.18424016, 0.03580371,
0.18463984, 0.15326728, -0.15144381, -0.0136445 , -0.01842183,
-0.01942809, -0.40844846, -0.3803786 , 0.06027165, 0.1963685 ,
0.17956594, 0.43164128, 0.15884452, -0.05903239, -0.12084594,
-0.3421759 , 0.09406078, -0.1743247 , -0.07216409, -0.36593992,
0.40148914, 0.06790256, 0.00517231, -0.03673836, 0.15515997,
0.05461619, 0.34084585, 0.18900603, -0.02054791, -0.3929679 ,
0.02184797, 0.63548833, 0.15527408, -0.04567065, -0.04842073],
dtype=float32),
array([-0.03004633, 0.21360289, 0.15284857, 0.24769907, -0.22398064,
-0.1678496 , -0.3093915 , -0.16064537, 0.01739492, 0.16211355,
0.07881376, 0.01900313, 0.22551589, -0.11012595, 0.04134884,
0.08953694, 0.00311358, -0.01172279, 0.27477577, -0.3133619 ,
0.21313639, -0.03454831, -0.0355614 , 0.29706472, -0.14337239,
0.06306618, -0.20414938, 0.08350278, 0.24285245, -0.03777679,
-0.08204563, 0.42742103, 0.04623334, 0.0582608 , -0.04253339,
-0.3693069 , -0.08587249, -0.07318146, 0.02444837, -0.17312065,
0.1859736 , -0.13985237, 0.06309649, -0.06333318, -0.25265425,
-0.20349394, 0.16175316, -0.24759755, 0.06637027, -0.1069316 ,
0.22224535, 0.10144968, -0.14347847, -0.2009012 , 0.3089581 ,
-0.08966508, 0.07063737, -0.1094429 , 0.20200677, 0.08720575,
-0.0347297 , -0.24422231, -0.03246196, -0.2535116 , 0.08952698,
-0.09993225, 0.03947218, -0.00299238, 0.42530695, -0.28596228,
0.09581468, 0.07731544, 0.10829563, 0.2028767 , 0.02469129,
0.07771173, 0.10135574, 0.19251762, 0.22314067, 0.07240579,
0.01130728, 0.11720331, -0.134948 , -0.03010134, 0.06649083,
0.03310914, 0.42079785, -0.03915739, 0.01902897, 0.01007993,
0.01172702, -0.08903081, -0.03384162, -0.07326104, -0.06739528,
0.13880825, 0.16431451, -0.00987516, -0.22946021, -0.08215933],
dtype=float32)]
s.smiles2vec('c1ccccc1C', mode = 'average')
array([-1.34022804e-02, 1.61780491e-02, 1.71152934e-01, 2.72531897e-01,
-1.82708904e-01, -1.00375563e-01, -3.14150989e-01, 1.54515356e-03,
-5.33701554e-02, 3.77645455e-02, -8.03763866e-02, 3.61803323e-02,
2.03489780e-01, -1.41125008e-01, -6.87211454e-02, 3.95707041e-02,
1.86435529e-04, 2.40851752e-03, 1.79549366e-01, -2.34853595e-01,
2.54714727e-01, -1.01204760e-01, 1.46192573e-02, 3.20846856e-01,
-1.26948640e-01, -3.60341370e-02, -1.99805945e-02, -6.57920390e-02,
1.12212896e-01, -9.79810804e-02, -6.81295246e-02, 3.15792769e-01,
9.70366374e-02, 2.58648433e-02, -1.16440386e-01, -2.44033992e-01,
1.81054398e-02, -7.37853795e-02, 2.92290300e-02, -6.44483045e-02,
4.18638885e-02, 1.02528289e-01, 9.31825042e-02, -7.51460642e-02,
-1.67078704e-01, -5.22097796e-02, 2.24865556e-01, -4.42353934e-02,
1.09328270e-01, -1.35819301e-01, 2.80320883e-01, 2.93976273e-02,
-1.31033853e-01, -5.01542650e-02, 3.27991471e-02, -5.95743805e-02,
3.00464302e-01, 6.76936954e-02, 1.57678947e-01, 4.93722111e-02,
-2.24191621e-02, -1.54141262e-01, -2.29228213e-02, -2.18875885e-01,
6.26653433e-02, 4.23537977e-02, 9.63697284e-02, -7.72181004e-02,
2.05831230e-01, -1.52192056e-01, 3.81932929e-02, -1.65566504e-01,
-1.36041492e-01, 1.31574184e-01, 1.10529892e-01, 1.28638834e-01,
2.66498506e-01, 1.75681069e-01, 8.20541382e-02, -2.42200755e-02,
-1.65434316e-01, 1.05632044e-01, -1.54636353e-01, -5.11327162e-02,
-1.49724543e-01, 2.17299134e-01, 2.44350210e-01, -1.69925392e-02,
-8.85469373e-03, 8.26199502e-02, 3.31716053e-02, 1.25907525e-01,
7.75822103e-02, -4.69044782e-02, -2.30181590e-01, 8.03281143e-02,
3.99901420e-01, 7.26994574e-02, -1.37565434e-01, -6.52900264e-02],
dtype=float32)
s.smiles2vec('c1ccccc1C', mode = 'sum')
array([-2.6804561e-02, 3.2356098e-02, 3.4230587e-01, 5.4506379e-01,
-3.6541781e-01, -2.0075113e-01, -6.2830198e-01, 3.0903071e-03,
-1.0674031e-01, 7.5529091e-02, -1.6075277e-01, 7.2360665e-02,
4.0697956e-01, -2.8225002e-01, -1.3744229e-01, 7.9141408e-02,
3.7287106e-04, 4.8170350e-03, 3.5909873e-01, -4.6970719e-01,
5.0942945e-01, -2.0240952e-01, 2.9238515e-02, 6.4169371e-01,
-2.5389728e-01, -7.2068274e-02, -3.9961189e-02, -1.3158408e-01,
2.2442579e-01, -1.9596216e-01, -1.3625905e-01, 6.3158554e-01,
1.9407327e-01, 5.1729687e-02, -2.3288077e-01, -4.8806798e-01,
3.6210880e-02, -1.4757076e-01, 5.8458060e-02, -1.2889661e-01,
8.3727777e-02, 2.0505658e-01, 1.8636501e-01, -1.5029213e-01,
-3.3415741e-01, -1.0441956e-01, 4.4973111e-01, -8.8470787e-02,
2.1865654e-01, -2.7163860e-01, 5.6064177e-01, 5.8795255e-02,
-2.6206771e-01, -1.0030853e-01, 6.5598294e-02, -1.1914876e-01,
6.0092860e-01, 1.3538739e-01, 3.1535789e-01, 9.8744422e-02,
-4.4838324e-02, -3.0828252e-01, -4.5845643e-02, -4.3775177e-01,
1.2533069e-01, 8.4707595e-02, 1.9273946e-01, -1.5443620e-01,
4.1166246e-01, -3.0438411e-01, 7.6386586e-02, -3.3113301e-01,
-2.7208298e-01, 2.6314837e-01, 2.2105978e-01, 2.5727767e-01,
5.3299701e-01, 3.5136214e-01, 1.6410828e-01, -4.8440151e-02,
-3.3086863e-01, 2.1126409e-01, -3.0927271e-01, -1.0226543e-01,
-2.9944909e-01, 4.3459827e-01, 4.8870042e-01, -3.3985078e-02,
-1.7709387e-02, 1.6523990e-01, 6.6343211e-02, 2.5181505e-01,
1.5516442e-01, -9.3808956e-02, -4.6036318e-01, 1.6065623e-01,
7.9980284e-01, 1.4539891e-01, -2.7513087e-01, -1.3058005e-01],
dtype=float32)
s.smiles2vec('c1ccccc1C', mode = 'avg_pool').shape
(300,)