I have a data frame like this, of DNA sequences:
Feature Label
GCTAGATGACAGT 0
TTTTAAAACAG 1
TAGCTATACT 2
TGGGGCAAAAAAAA 0
AATGTCG 3
AATGTCG 0
AATGTCG 1
There is one column with a DNA sequence and a label that can be 0, 1, 2 or 3 (i.e. the category of that DNA sequence). I want to develop a NN that predicts the probability of each sequence belonging to category 1, 2 or 3 (not 0; I don't care about 0). Each sequence can appear multiple times in the data frame, and a sequence may appear in multiple (or all) categories. So the output should look like this:
GCTAGATGACAGT (0.9,0.1,0.2)
TTTTAAAACAG (0.7,0.6,0.3)
TAGCTATACT (0.3,0.3,0.2)
TGGGGCAAAAAAAA (0.1,0.5,0.6)
Where the numbers in the tuple are the probability that the sequence is found in category 1,2 and 3.
I'm having a basic problem converting the text to a numeric array for input into keras Sequential():
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import EarlyStopping, ModelCheckpoint
import os
from random import random
from numpy import array
from numpy import cumsum
import pandas as pd
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.preprocessing.text import Tokenizer
# Attempt 1: build a stratified 10-fold split over the data frame and, inside
# each fold, try to turn the raw sequence strings into a binary matrix.
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime abort -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'
# IPython/Jupyter magic: only valid in a notebook/IPython session, not in a plain .py script.
%matplotlib
from sklearn.feature_extraction.text import CountVectorizer
# fix random seed for reproducibility
# NOTE(review): numpy.random.seed() returns None, so `seed` is None here; the
# call itself still seeds NumPy's global RNG.
seed = numpy.random.seed(7)
max_words = 10000
# load the data set into a data frame
df = pd.read_csv("test_dataset.csv")
# define 10-fold cross validation test harness
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# The [0] column list keeps X as a one-column DataFrame (not a Series).
X = df.iloc[:,[0]]
# Last column holds the labels.
y = df.iloc[:,-1]
# kf is not read anywhere in the visible code.
kf = kfold.get_n_splits(X)
cvscores = []
for train, test in kfold.split(X, y):
# NOTE(review): `train`/`test` are positional row indices, but X is a DataFrame,
# so X[train] looks them up as column labels -- consistent with the reported
# KeyError "... not in index". Row selection by position would be X.iloc[train].
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
# sequences = tokenizer.texts_to_sequences(X_train)
# data = sequence.pad_sequences(sequences, maxlen= 100000)
# NOTE(review): the tokenizer is never fitted (no fit_on_texts call is visible)
# and sequences_to_matrix receives the raw split rather than integer sequences
# produced by texts_to_sequences.
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
# y_train = keras.utils.to_categorical(y_train, num_classes)
# y_test = keras.utils.to_categorical(y_test, num_classes)
The error: KeyError: '[ 0 1 3 ... 62286 62287 62288] not in index'
As you can see, I've tried a couple of ways: (1) using tokenizer.sequences_to_matrix (which gives the error above), or (2) using texts_to_sequences (which gives the same error).
Can someone show me how I'm meant to convert each sequence to an input suitable for keras? (It's my first ever NN, so an example would be great.) What I planned to do next is something like this (untested, as I'm currently struggling to read the sequences into the model):
# Planned (untested) model definition, training and per-fold scoring.
# create model
model = Sequential()
# model.add(Embedding(3000, 32, input_length=30))
# model.add(Bidirectional(LSTM(20, return_sequences=True), input_shape=(n_timesteps, 1)))
# Only layer actually added: a single sigmoid unit, i.e. one output value per sample.
# NOTE(review): the stated goal is one probability each for categories 1, 2 and 3,
# which presumably needs three output units -- confirm.
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Monitor val accuracy and perform early stopping
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)
# mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
# Fit the model
# NOTE(review): this fits on X_train (the raw split), not the x_train matrix
# built in the preprocessing loop -- confirm which one is intended.
model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)
# Evaluate the model
# scores = model.evaluate(X[test], Y[test], verbose=0)
# print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# cvscores.append(scores[1] * 100)
#print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))
Update 1: I was wondering whether I was meant to convert the sequences to a matrix outside of the kfold loop, like this:
# Update 1 attempt: convert the sequences to a matrix once, before the fold loop.
# `seed`, `df` and the imports come from the earlier snippet.
# define 10-fold cross validation test harness
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# .values on a one-column selection gives a 2-D NumPy array (one column).
X = df.iloc[:,[0]].values
y = df.iloc[:,-1].values
# kf is not read anywhere in the visible code.
kf = kfold.get_n_splits(X)
cvscores = []
# NOTE(review): the tokenizer is never fitted and X still holds raw strings,
# not integer index sequences -- consistent with the reported TypeError
# comparing 'str' and 'int'.
tokenizer = Tokenizer(num_words=1000000)
X = tokenizer.sequences_to_matrix(X, mode='binary')
for train, test in kfold.split(X, y):
# X and y are NumPy arrays here, so indexing with the fold indices selects rows by position.
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
print(X_train[0:10])
But I get the error:
TypeError: '>=' not supported between instances of 'str' and 'int'
Edit 2: I tried an approach described here:
# Edit 2 attempt: label-encode the targets, then fit a Tokenizer on each
# training fold and convert the fold's texts to integer sequences.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# .values on a one-column selection gives a 2-D NumPy array (one column).
X = df.iloc[:,[0]].values
y = df.iloc[:,-1].values
# kf is not read anywhere in the visible code.
kf = kfold.get_n_splits(X)
cvscores = []
# NOTE(review): LabelEncoder is not among the imports shown; presumably
# sklearn.preprocessing.LabelEncoder.
le = LabelEncoder()
# Y (encoded, reshaped to a column) is only used for the split below; the
# per-fold targets are taken from the un-encoded lowercase `y`.
Y = le.fit_transform(y)
Y = Y.reshape(-1,1)
max_words = 1000
# max_len is not used in the visible code.
max_len = 150
for train, test in kfold.split(X, Y):
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]
# NOTE(review): the sample sequences contain no whitespace, so a word-level
# Tokenizer would presumably treat each whole sequence as one token; a
# character-level tokenizer (char_level=True) may be what is wanted -- confirm.
tok = Tokenizer(num_words=max_words)
# NOTE(review): X_train is 2-D, so each item handed to the tokenizer is a
# length-1 ndarray rather than a str -- consistent with the reported
# AttributeError about 'lower'. Presumably a 1-D sequence of strings is needed.
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
With the error:
AttributeError: 'numpy.ndarray' object has no attribute 'lower'