class
Corpus
[source]
Corpus
(infile
,tokenizer
,isdir
=False
,dropout
=0
)
infile: A file that stores SMILES line-by-line.
tokenizer: SPE tokenizer
dropout: SPE dropout, default = 0
learn_spe2vec
[source]
learn_spe2vec
(corpus
,outfile
=None
,vector_size
=100
,window
=10
,min_count
=10
,n_jobs
=1
,method
='skip-gram'
, **kwargs
)
Train a spe2vec model.
corpus: an instance of the Corpus class.
outfile: str, name of the spe2vec model file.
vector_size: dimensions of embedding.
window: number of tokens considered as context.
min_count: minimum number of occurrences a token must have to be considered in training.
n_jobs: number of CPU cores used for training.
method: modeling method, choose from ['cbow', 'skip-gram']
More training parameters can be found at https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
load_spe2vec
[source]
load_spe2vec
(model_path
)
class
SPE2Vec
[source]
SPE2Vec
(model_path
,tokenizer
)
s.tokenize('c1ccccc1C')
'c1ccccc1 C'
s.spe2vec('c1ccccc1C')
[array([ 0.00324177, -0.18124679, 0.1894573 , 0.29736474, -0.14143717, -0.03290153, -0.31891045, 0.16373567, -0.12413523, -0.08658446, -0.23956653, 0.05335753, 0.18146366, -0.17212407, -0.17879114, -0.01039552, -0.00274071, 0.01653983, 0.08432296, -0.15634526, 0.29629305, -0.16786121, 0.06479991, 0.34462902, -0.11052489, -0.13513446, 0.16418819, -0.21508686, -0.01842665, -0.15818536, -0.05421342, 0.2041645 , 0.14783993, -0.00653112, -0.19034739, -0.11876111, 0.12208337, -0.0743893 , 0.03400969, 0.04422404, -0.10224582, 0.34490895, 0.12326851, -0.08695894, -0.08150315, 0.09907438, 0.28797793, 0.15912676, 0.15228626, -0.164707 , 0.33839643, -0.04265443, -0.11858924, 0.10059267, -0.24335982, -0.02948368, 0.53029126, 0.2448303 , 0.11335112, 0.01153868, -0.01010862, -0.06406022, -0.01338368, -0.18424016, 0.03580371, 0.18463984, 0.15326728, -0.15144381, -0.0136445 , -0.01842183, -0.01942809, -0.40844846, -0.3803786 , 0.06027165, 0.1963685 , 0.17956594, 0.43164128, 0.15884452, -0.05903239, -0.12084594, -0.3421759 , 0.09406078, -0.1743247 , -0.07216409, -0.36593992, 0.40148914, 0.06790256, 0.00517231, -0.03673836, 0.15515997, 0.05461619, 0.34084585, 0.18900603, -0.02054791, -0.3929679 , 0.02184797, 0.63548833, 0.15527408, -0.04567065, -0.04842073], dtype=float32), array([-0.03004633, 0.21360289, 0.15284857, 0.24769907, -0.22398064, -0.1678496 , -0.3093915 , -0.16064537, 0.01739492, 0.16211355, 0.07881376, 0.01900313, 0.22551589, -0.11012595, 0.04134884, 0.08953694, 0.00311358, -0.01172279, 0.27477577, -0.3133619 , 0.21313639, -0.03454831, -0.0355614 , 0.29706472, -0.14337239, 0.06306618, -0.20414938, 0.08350278, 0.24285245, -0.03777679, -0.08204563, 0.42742103, 0.04623334, 0.0582608 , -0.04253339, -0.3693069 , -0.08587249, -0.07318146, 0.02444837, -0.17312065, 0.1859736 , -0.13985237, 0.06309649, -0.06333318, -0.25265425, -0.20349394, 0.16175316, -0.24759755, 0.06637027, -0.1069316 , 0.22224535, 0.10144968, -0.14347847, -0.2009012 , 0.3089581 , -0.08966508, 0.07063737, 
-0.1094429 , 0.20200677, 0.08720575, -0.0347297 , -0.24422231, -0.03246196, -0.2535116 , 0.08952698, -0.09993225, 0.03947218, -0.00299238, 0.42530695, -0.28596228, 0.09581468, 0.07731544, 0.10829563, 0.2028767 , 0.02469129, 0.07771173, 0.10135574, 0.19251762, 0.22314067, 0.07240579, 0.01130728, 0.11720331, -0.134948 , -0.03010134, 0.06649083, 0.03310914, 0.42079785, -0.03915739, 0.01902897, 0.01007993, 0.01172702, -0.08903081, -0.03384162, -0.07326104, -0.06739528, 0.13880825, 0.16431451, -0.00987516, -0.22946021, -0.08215933], dtype=float32)]
s.smiles2vec('c1ccccc1C', mode = 'average')
array([-1.34022804e-02, 1.61780491e-02, 1.71152934e-01, 2.72531897e-01, -1.82708904e-01, -1.00375563e-01, -3.14150989e-01, 1.54515356e-03, -5.33701554e-02, 3.77645455e-02, -8.03763866e-02, 3.61803323e-02, 2.03489780e-01, -1.41125008e-01, -6.87211454e-02, 3.95707041e-02, 1.86435529e-04, 2.40851752e-03, 1.79549366e-01, -2.34853595e-01, 2.54714727e-01, -1.01204760e-01, 1.46192573e-02, 3.20846856e-01, -1.26948640e-01, -3.60341370e-02, -1.99805945e-02, -6.57920390e-02, 1.12212896e-01, -9.79810804e-02, -6.81295246e-02, 3.15792769e-01, 9.70366374e-02, 2.58648433e-02, -1.16440386e-01, -2.44033992e-01, 1.81054398e-02, -7.37853795e-02, 2.92290300e-02, -6.44483045e-02, 4.18638885e-02, 1.02528289e-01, 9.31825042e-02, -7.51460642e-02, -1.67078704e-01, -5.22097796e-02, 2.24865556e-01, -4.42353934e-02, 1.09328270e-01, -1.35819301e-01, 2.80320883e-01, 2.93976273e-02, -1.31033853e-01, -5.01542650e-02, 3.27991471e-02, -5.95743805e-02, 3.00464302e-01, 6.76936954e-02, 1.57678947e-01, 4.93722111e-02, -2.24191621e-02, -1.54141262e-01, -2.29228213e-02, -2.18875885e-01, 6.26653433e-02, 4.23537977e-02, 9.63697284e-02, -7.72181004e-02, 2.05831230e-01, -1.52192056e-01, 3.81932929e-02, -1.65566504e-01, -1.36041492e-01, 1.31574184e-01, 1.10529892e-01, 1.28638834e-01, 2.66498506e-01, 1.75681069e-01, 8.20541382e-02, -2.42200755e-02, -1.65434316e-01, 1.05632044e-01, -1.54636353e-01, -5.11327162e-02, -1.49724543e-01, 2.17299134e-01, 2.44350210e-01, -1.69925392e-02, -8.85469373e-03, 8.26199502e-02, 3.31716053e-02, 1.25907525e-01, 7.75822103e-02, -4.69044782e-02, -2.30181590e-01, 8.03281143e-02, 3.99901420e-01, 7.26994574e-02, -1.37565434e-01, -6.52900264e-02], dtype=float32)
s.smiles2vec('c1ccccc1C', mode = 'sum')
array([-2.6804561e-02, 3.2356098e-02, 3.4230587e-01, 5.4506379e-01, -3.6541781e-01, -2.0075113e-01, -6.2830198e-01, 3.0903071e-03, -1.0674031e-01, 7.5529091e-02, -1.6075277e-01, 7.2360665e-02, 4.0697956e-01, -2.8225002e-01, -1.3744229e-01, 7.9141408e-02, 3.7287106e-04, 4.8170350e-03, 3.5909873e-01, -4.6970719e-01, 5.0942945e-01, -2.0240952e-01, 2.9238515e-02, 6.4169371e-01, -2.5389728e-01, -7.2068274e-02, -3.9961189e-02, -1.3158408e-01, 2.2442579e-01, -1.9596216e-01, -1.3625905e-01, 6.3158554e-01, 1.9407327e-01, 5.1729687e-02, -2.3288077e-01, -4.8806798e-01, 3.6210880e-02, -1.4757076e-01, 5.8458060e-02, -1.2889661e-01, 8.3727777e-02, 2.0505658e-01, 1.8636501e-01, -1.5029213e-01, -3.3415741e-01, -1.0441956e-01, 4.4973111e-01, -8.8470787e-02, 2.1865654e-01, -2.7163860e-01, 5.6064177e-01, 5.8795255e-02, -2.6206771e-01, -1.0030853e-01, 6.5598294e-02, -1.1914876e-01, 6.0092860e-01, 1.3538739e-01, 3.1535789e-01, 9.8744422e-02, -4.4838324e-02, -3.0828252e-01, -4.5845643e-02, -4.3775177e-01, 1.2533069e-01, 8.4707595e-02, 1.9273946e-01, -1.5443620e-01, 4.1166246e-01, -3.0438411e-01, 7.6386586e-02, -3.3113301e-01, -2.7208298e-01, 2.6314837e-01, 2.2105978e-01, 2.5727767e-01, 5.3299701e-01, 3.5136214e-01, 1.6410828e-01, -4.8440151e-02, -3.3086863e-01, 2.1126409e-01, -3.0927271e-01, -1.0226543e-01, -2.9944909e-01, 4.3459827e-01, 4.8870042e-01, -3.3985078e-02, -1.7709387e-02, 1.6523990e-01, 6.6343211e-02, 2.5181505e-01, 1.5516442e-01, -9.3808956e-02, -4.6036318e-01, 1.6065623e-01, 7.9980284e-01, 1.4539891e-01, -2.7513087e-01, -1.3058005e-01], dtype=float32)
s.smiles2vec('c1ccccc1C', mode = 'avg_pool').shape
(300,)