class Corpus[source]

Corpus(infile, tokenizer, isdir=False, dropout=0)

infile: A file that stores SMILES line-by-line.

tokenizer: SPE tokenizer

dropout: SPE dropout, default = 0

learn_spe2vec[source]

learn_spe2vec(corpus, outfile=None, vector_size=100, window=10, min_count=10, n_jobs=1, method='skip-gram', **kwargs)

Train a spe2vec model.

corpus: an instance of the Corpus class.

outfile: str, name of the spe2vec model file.

vector_size: dimensions of embedding.

window: number of tokens considered as context

min_count: minimum number of occurrences a token must have to be considered in training.

n_jobs: number of cpu cores used for training.

method: modeling method, choose from ['cbow', 'skip-gram']

More training parameters can be found at https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

load_spe2vec[source]

load_spe2vec(model_path)

class SPE2Vec[source]

SPE2Vec(model_path, tokenizer)

s.tokenize('c1ccccc1C')
'c1ccccc1 C'
s.spe2vec('c1ccccc1C')
[array([ 0.00324177, -0.18124679,  0.1894573 ,  0.29736474, -0.14143717,
        -0.03290153, -0.31891045,  0.16373567, -0.12413523, -0.08658446,
        -0.23956653,  0.05335753,  0.18146366, -0.17212407, -0.17879114,
        -0.01039552, -0.00274071,  0.01653983,  0.08432296, -0.15634526,
         0.29629305, -0.16786121,  0.06479991,  0.34462902, -0.11052489,
        -0.13513446,  0.16418819, -0.21508686, -0.01842665, -0.15818536,
        -0.05421342,  0.2041645 ,  0.14783993, -0.00653112, -0.19034739,
        -0.11876111,  0.12208337, -0.0743893 ,  0.03400969,  0.04422404,
        -0.10224582,  0.34490895,  0.12326851, -0.08695894, -0.08150315,
         0.09907438,  0.28797793,  0.15912676,  0.15228626, -0.164707  ,
         0.33839643, -0.04265443, -0.11858924,  0.10059267, -0.24335982,
        -0.02948368,  0.53029126,  0.2448303 ,  0.11335112,  0.01153868,
        -0.01010862, -0.06406022, -0.01338368, -0.18424016,  0.03580371,
         0.18463984,  0.15326728, -0.15144381, -0.0136445 , -0.01842183,
        -0.01942809, -0.40844846, -0.3803786 ,  0.06027165,  0.1963685 ,
         0.17956594,  0.43164128,  0.15884452, -0.05903239, -0.12084594,
        -0.3421759 ,  0.09406078, -0.1743247 , -0.07216409, -0.36593992,
         0.40148914,  0.06790256,  0.00517231, -0.03673836,  0.15515997,
         0.05461619,  0.34084585,  0.18900603, -0.02054791, -0.3929679 ,
         0.02184797,  0.63548833,  0.15527408, -0.04567065, -0.04842073],
       dtype=float32),
 array([-0.03004633,  0.21360289,  0.15284857,  0.24769907, -0.22398064,
        -0.1678496 , -0.3093915 , -0.16064537,  0.01739492,  0.16211355,
         0.07881376,  0.01900313,  0.22551589, -0.11012595,  0.04134884,
         0.08953694,  0.00311358, -0.01172279,  0.27477577, -0.3133619 ,
         0.21313639, -0.03454831, -0.0355614 ,  0.29706472, -0.14337239,
         0.06306618, -0.20414938,  0.08350278,  0.24285245, -0.03777679,
        -0.08204563,  0.42742103,  0.04623334,  0.0582608 , -0.04253339,
        -0.3693069 , -0.08587249, -0.07318146,  0.02444837, -0.17312065,
         0.1859736 , -0.13985237,  0.06309649, -0.06333318, -0.25265425,
        -0.20349394,  0.16175316, -0.24759755,  0.06637027, -0.1069316 ,
         0.22224535,  0.10144968, -0.14347847, -0.2009012 ,  0.3089581 ,
        -0.08966508,  0.07063737, -0.1094429 ,  0.20200677,  0.08720575,
        -0.0347297 , -0.24422231, -0.03246196, -0.2535116 ,  0.08952698,
        -0.09993225,  0.03947218, -0.00299238,  0.42530695, -0.28596228,
         0.09581468,  0.07731544,  0.10829563,  0.2028767 ,  0.02469129,
         0.07771173,  0.10135574,  0.19251762,  0.22314067,  0.07240579,
         0.01130728,  0.11720331, -0.134948  , -0.03010134,  0.06649083,
         0.03310914,  0.42079785, -0.03915739,  0.01902897,  0.01007993,
         0.01172702, -0.08903081, -0.03384162, -0.07326104, -0.06739528,
         0.13880825,  0.16431451, -0.00987516, -0.22946021, -0.08215933],
       dtype=float32)]
s.smiles2vec('c1ccccc1C', mode = 'average')
array([-1.34022804e-02,  1.61780491e-02,  1.71152934e-01,  2.72531897e-01,
       -1.82708904e-01, -1.00375563e-01, -3.14150989e-01,  1.54515356e-03,
       -5.33701554e-02,  3.77645455e-02, -8.03763866e-02,  3.61803323e-02,
        2.03489780e-01, -1.41125008e-01, -6.87211454e-02,  3.95707041e-02,
        1.86435529e-04,  2.40851752e-03,  1.79549366e-01, -2.34853595e-01,
        2.54714727e-01, -1.01204760e-01,  1.46192573e-02,  3.20846856e-01,
       -1.26948640e-01, -3.60341370e-02, -1.99805945e-02, -6.57920390e-02,
        1.12212896e-01, -9.79810804e-02, -6.81295246e-02,  3.15792769e-01,
        9.70366374e-02,  2.58648433e-02, -1.16440386e-01, -2.44033992e-01,
        1.81054398e-02, -7.37853795e-02,  2.92290300e-02, -6.44483045e-02,
        4.18638885e-02,  1.02528289e-01,  9.31825042e-02, -7.51460642e-02,
       -1.67078704e-01, -5.22097796e-02,  2.24865556e-01, -4.42353934e-02,
        1.09328270e-01, -1.35819301e-01,  2.80320883e-01,  2.93976273e-02,
       -1.31033853e-01, -5.01542650e-02,  3.27991471e-02, -5.95743805e-02,
        3.00464302e-01,  6.76936954e-02,  1.57678947e-01,  4.93722111e-02,
       -2.24191621e-02, -1.54141262e-01, -2.29228213e-02, -2.18875885e-01,
        6.26653433e-02,  4.23537977e-02,  9.63697284e-02, -7.72181004e-02,
        2.05831230e-01, -1.52192056e-01,  3.81932929e-02, -1.65566504e-01,
       -1.36041492e-01,  1.31574184e-01,  1.10529892e-01,  1.28638834e-01,
        2.66498506e-01,  1.75681069e-01,  8.20541382e-02, -2.42200755e-02,
       -1.65434316e-01,  1.05632044e-01, -1.54636353e-01, -5.11327162e-02,
       -1.49724543e-01,  2.17299134e-01,  2.44350210e-01, -1.69925392e-02,
       -8.85469373e-03,  8.26199502e-02,  3.31716053e-02,  1.25907525e-01,
        7.75822103e-02, -4.69044782e-02, -2.30181590e-01,  8.03281143e-02,
        3.99901420e-01,  7.26994574e-02, -1.37565434e-01, -6.52900264e-02],
      dtype=float32)
s.smiles2vec('c1ccccc1C', mode = 'sum')
array([-2.6804561e-02,  3.2356098e-02,  3.4230587e-01,  5.4506379e-01,
       -3.6541781e-01, -2.0075113e-01, -6.2830198e-01,  3.0903071e-03,
       -1.0674031e-01,  7.5529091e-02, -1.6075277e-01,  7.2360665e-02,
        4.0697956e-01, -2.8225002e-01, -1.3744229e-01,  7.9141408e-02,
        3.7287106e-04,  4.8170350e-03,  3.5909873e-01, -4.6970719e-01,
        5.0942945e-01, -2.0240952e-01,  2.9238515e-02,  6.4169371e-01,
       -2.5389728e-01, -7.2068274e-02, -3.9961189e-02, -1.3158408e-01,
        2.2442579e-01, -1.9596216e-01, -1.3625905e-01,  6.3158554e-01,
        1.9407327e-01,  5.1729687e-02, -2.3288077e-01, -4.8806798e-01,
        3.6210880e-02, -1.4757076e-01,  5.8458060e-02, -1.2889661e-01,
        8.3727777e-02,  2.0505658e-01,  1.8636501e-01, -1.5029213e-01,
       -3.3415741e-01, -1.0441956e-01,  4.4973111e-01, -8.8470787e-02,
        2.1865654e-01, -2.7163860e-01,  5.6064177e-01,  5.8795255e-02,
       -2.6206771e-01, -1.0030853e-01,  6.5598294e-02, -1.1914876e-01,
        6.0092860e-01,  1.3538739e-01,  3.1535789e-01,  9.8744422e-02,
       -4.4838324e-02, -3.0828252e-01, -4.5845643e-02, -4.3775177e-01,
        1.2533069e-01,  8.4707595e-02,  1.9273946e-01, -1.5443620e-01,
        4.1166246e-01, -3.0438411e-01,  7.6386586e-02, -3.3113301e-01,
       -2.7208298e-01,  2.6314837e-01,  2.2105978e-01,  2.5727767e-01,
        5.3299701e-01,  3.5136214e-01,  1.6410828e-01, -4.8440151e-02,
       -3.3086863e-01,  2.1126409e-01, -3.0927271e-01, -1.0226543e-01,
       -2.9944909e-01,  4.3459827e-01,  4.8870042e-01, -3.3985078e-02,
       -1.7709387e-02,  1.6523990e-01,  6.6343211e-02,  2.5181505e-01,
        1.5516442e-01, -9.3808956e-02, -4.6036318e-01,  1.6065623e-01,
        7.9980284e-01,  1.4539891e-01, -2.7513087e-01, -1.3058005e-01],
      dtype=float32)
s.smiles2vec('c1ccccc1C', mode = 'avg_pool').shape
(300,)