分享

基于神经网络的溶解度预测和回归分析

 DrugAI 2022-04-19

人工智能是当前的热门主题。本文尝试使用神经网络建立化合物物理性质(溶解度)的预测模型。机器学习库使用由Google开发的TensorFlow;Keras是一个使TensorFlow的神经网络功能更易于使用的软件包。


<数据集文件见:https://download.csdn.net/download/u012325865/10670205>


代码示例


基于神经网络的溶解度预测

  1. #导入依赖包

  2. from rdkit import Chem

  3. from rdkit.Chem.Draw import IPythonConsole

  4. from mordred import descriptors, Calculator #pip install mordred

  5. import numpy as np

  6. from sklearn.preprocessing import StandardScaler

  7. from sklearn import model_selection

  8. from keras.models import Sequential

  9. from keras.layers import Dense, Activation

  10. from keras.optimizers import SGD

  1. calc = Calculator(descriptors, ignore_3D = True)

  2. #加载数据

  3. sdf = [ mol for mol in Chem.SDMolSupplier('solubility.sdf')]

  4. #使用mordred计算sdf文件中的分子化学描述符

  5. X = calc.pandas(sdf).astype('float').dropna(axis = 1)

  1. #转换为Numpy格式数组

  2. X = np.array(X, dtype = np.float32)

  3. #转换为平均值0,每个描述符的色散1

  4. st = StandardScaler()

  5. X= st.fit_transform(X)

  6. #保存到npy文件供以后重用

  7. np.save("X_2d.npy", X)

  1. #定义读取溶解度的函数

  2. def getResponse( mols, prop= "SOL" ):

  3.    Y = []

  4.    for mol in mols:

  5.        act = mol.GetProp( prop )

  6.        Y.append( act )

  7.    return Y

  8. #从sdf文件中读取溶解度

  9. Y = getResponse(sdf)

  10. #转换为Numpy格式数组

  11. Y = np.array(Y, dtype = np.float32)

  12. #保存到npy文件供以后重用

  13. np.save("Y_2d.npy", Y)

  1. #重新随机划分训练集和测试集

  2. X_train, X_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.25, random_state=42)

  3. np.save("X_train.npy", X_train)

  4. np.save("X_test.npy", X_test)

  5. np.save("y_train.npy", y_train)

  6. np.save("y_test.npy", y_test)

  1. model = Sequential()

  2. #输入层。传递给下一层的维度为50 输入数据维度(input_dim)是1114

  3. model.add(Dense(units = 50, input_dim = X.shape[1]))

  4. model.add(Activation("sigmoid"))

  5. #输出层。 维度1,即输出单个值。

  6. model.add(Dense(units = 1))

  7. model.summary()

  1. #SGD是随机梯度下降法。 nesterovNesterov的加速度梯度下降法。

  2. model.compile(loss = 'mean_squared_error',

  3.    optimizer = SGD(lr = 0.01, momentum = 0.9, nesterov = True),

  4.    metrics=['accuracy'])

  5. history = model.fit(X_train, y_train, epochs = 100, batch_size = 32,

  6.    validation_data = (X_test, y_test))

  7. score = model.evaluate(X_test, y_test, verbose = 0)

  8. print('Test loss:', score[0])

  9. print('Test accuracy:', score[1])

  10. y_pred = model.predict(X_test)

  11. rms = (np.mean((y_test - y_pred) ** 2)) ** 0.5

  12. #s = np.std(y_test - y_pred)

  13. print("Neural Network RMS", rms)

  1. %matplotlib inline

  2. import matplotlib.pyplot as plt

  3. plt.figure()

  4. plt.scatter(y_train, model.predict(X_train), label = 'Train', c = 'blue')

  5. plt.title('Neural Network Predictor')

  6. plt.xlabel('Measured Solubility')

  7. plt.ylabel('Predicted Solubility')

  8. plt.scatter(y_test, model.predict(X_test), c = 'lightgreen', label = 'Test', alpha = 0.8)

  9. plt.legend(loc = 4)

  10. plt.savefig('Neural Network Predictor.png', dpi=300)

  11. plt.show()

  1. import matplotlib.pyplot as plt

  2. loss = history.history['loss']

  3. val_loss = history.history['val_loss']

  4. epochs = len(loss)

  5. plt.plot(range(epochs), loss, marker = '.', label = 'loss')

  6. plt.plot(range(epochs), val_loss, marker = '.', label = 'val_loss')

  7. plt.legend(loc = 'best')

  8. plt.grid()

  9. plt.xlabel('epoch')

  10. plt.ylabel('loss')

  11. plt.show()

  1. model.compile(loss = 'mean_squared_error',

  2.    optimizer = SGD(lr = 0.01, momentum = 0.9, nesterov = True),

  3.    metrics=['accuracy'])

  4. from keras.callbacks import EarlyStopping

  5. history = model.fit(X_train, y_train, epochs = 100, batch_size = 32,

  6.    validation_data=(X_test, y_test), callbacks = [EarlyStopping()])

  7. score = model.evaluate(X_test, y_test, verbose = 0)

  8. print('Test loss:', score[0])

  9. print('Test accuracy:', score[1])

  10. y_pred = model.predict(X_test)

  11. rms = (np.mean((y_test - y_pred) ** 2)) ** 0.5

  12. #s = np.std(y_test - y_pred)

  13. print("Neural Network RMS", rms)

PLSR分析:偏最小二乘回归法分析

  1. import numpy as np

  2. from sklearn.preprocessing import StandardScaler

  3. from sklearn import model_selection

  4. from sklearn.metrics import mean_squared_error

  5. from sklearn.metrics import r2_score

  6. from sklearn.cross_decomposition import PLSRegression

  7. import sklearn

  8. print("sklearn ver.", sklearn.__version__)

  9. print("numpy ver.", np.__version__)

  1. #加载保存的数据文件

  2. X = np.load("X_2d.npy")

  3. Y = np.load("Y_2d.npy")

  4. #随机划分训练集和测试集

  5. X_train, X_test, y_train, y_test = model_selection.train_test_split(X,

  6.    Y, test_size = 0.25, random_state = 42)

  1. #计算解释溶解度分散的因子并使用多达15的因子进行回归分析。

  2. pls2 = PLSRegression(n_components = 15, scale = True)

  3. pls2.fit(X_train, y_train)

  4. pred_train = pls2.predict(X_train)

  5. pred_test = pls2.predict(X_test)

  6. rms = (np.mean((y_test - pred_test)**2))**0.5

  7. #s = np.std(y_test - y_pred)

  8. print("PLS regression RMS", rms)

  1. PLS regression RMS 2.834230670918034

  1. import pylab as plt

  2. plt.figure()

  3. plt.scatter(y_train, pred_train, label = 'Train', c = 'blue')

  4. plt.title('PLSR Predictor')

  5. plt.xlabel('Measured Solubility')

  6. plt.ylabel('Predicted Solubility')

  7. plt.scatter(y_test, pred_test, c = 'lightgreen', label = 'Test', alpha = 0.8)

  8. plt.legend(loc = 4)

  9. plt.savefig('PLSR Predictor.png', dpi=300)

  10. plt.show()


参考资料:

http://www.ag./charlesy/2017/07/21/keras%E3%81%A7%E5%8C%96%E5%90%88%E7%89%A9%E3%81%AE%E6%BA%B6%E8%A7%A3%E5%BA%A6%E4%BA%88%E6%B8%AC%EF%BC%88%E3%83%8B%E3%83%A5%E3%83%BC%E3%83%A9%E3%83%AB%E3%83%8D%E3%83%83%E3%83%88%E3%83%AF%E3%83%BC/


    转藏 分享 献花(0)

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多