I'm trying to build a chatbot in Keras. I've assigned each word in my vocabulary an integer ID of its own. A training sample looks like this:
[0 0 0 0 0 0 32 328 2839 13 192 1 ] -> [23 3289 328 2318 12 0 0 0 0 0 0 0]
and this is the kind of vector the trained model outputs for a sample:
[ 0.16102183 0.1238187 0.1159694 0.13688719 0.12964118 0.12848872 0.13515817 0.13582146 0.16919741 0.15453722 ... ]
How can I convert these embeddings back into the original words of my vocabulary?
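One idea I had is to invert the vocabulary and round each predicted float to the nearest ID (a minimal sketch; inverse_vocabulary and decode_ids are just names I made up), but I'm not sure this is the right way to invert the embedding:

# Hypothetical sketch, not part of my model: vocabulary is the word -> ID
# dict built during training. Invert it, round each predicted float to the
# nearest ID, and look the word up; unknown IDs fall back to <UNK>.
inverse_vocabulary = {i: w for w, i in vocabulary.items()}

def decode_ids(prediction):
    ids = [int(round(v)) for v in prediction]
    return [inverse_vocabulary.get(i, '<UNK>') for i in ids]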
Here is my code:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import os
import numpy as np
import cPickle as pickle  # Python 2; on Python 3, use the built-in pickle
class Chatbot(object):
    def __init__(self, h_layers=1):
        # self.name = name
        self.h_layers = h_layers
        self.seq2seq = None
        self.max_length = 0
        self.vocabulary = {}

    @staticmethod
    def load(model_name):
        with open('models/{}/chatbot_object.pkl'.format(model_name), 'rb') as pickle_file:
            obj = pickle.load(pickle_file)
        obj.seq2seq = load_model('models/{}/seq2seq.h5'.format(model_name))
        return obj
    def train(self, x_train, y_train):
        # Fit one shared vocabulary over questions and answers; calling
        # fit() twice would overwrite the first vocabulary, so fit once
        # on the concatenation.
        count_vect = CountVectorizer()
        count_vect.fit(x_train + y_train)
        self.vocabulary = count_vect.vocabulary_
        self.vocabulary.update({'<START>': len(self.vocabulary),
                                '<END>': len(self.vocabulary) + 1,
                                '<PAD>': len(self.vocabulary) + 2,
                                '<UNK>': len(self.vocabulary) + 3})
        # Tokenize and wrap every sample in <START> ... <END>.
        for i in range(len(x_train)):
            x_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(x_train[i])] + ['<END>']
        for i in range(len(y_train)):
            y_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(y_train[i])] + ['<END>']
        # The longest sample on either side sets the padded length.
        for sample in x_train + y_train:
            if len(sample) > self.max_length:
                self.max_length = len(sample)
        # Map tokens to IDs; tokens CountVectorizer never saw become <UNK>
        # instead of being dropped (dropping silently shortened sequences).
        for i in range(len(x_train)):
            x_train[i] = [self.vocabulary.get(w, self.vocabulary['<UNK>']) for w in x_train[i]]
        for i in range(len(y_train)):
            y_train[i] = [self.vocabulary.get(w, self.vocabulary['<UNK>']) for w in y_train[i]]
        # Pre-pad inputs, post-pad targets, as in the sample above.
        x_train = sequence.pad_sequences(x_train, maxlen=self.max_length,
                                         value=self.vocabulary['<PAD>'])
        y_train = sequence.pad_sequences(y_train, maxlen=self.max_length, padding='post',
                                         value=self.vocabulary['<PAD>'])
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        embedding_vector_length = 32
        self.seq2seq = Sequential()
        self.seq2seq.add(Embedding(len(self.vocabulary), embedding_vector_length,
                                   input_length=self.max_length))
        for _ in range(self.h_layers):
            self.seq2seq.add(LSTM(self.max_length, return_sequences=True))
        # The final LSTM collapses the sequence into a single vector of
        # length max_length, compared directly against the target IDs.
        self.seq2seq.add(LSTM(self.max_length))
        self.seq2seq.compile(loss='cosine_proximity', optimizer='adam', metrics=['accuracy'])
        self.seq2seq.fit(x_train[:100], y_train[:100], epochs=5, batch_size=32)
    def save(self, filename):
        # Create the model directory portably instead of shelling out.
        if not os.path.isdir('models/{}'.format(filename)):
            os.makedirs('models/{}'.format(filename))
        self.seq2seq.save('models/{}/seq2seq.h5'.format(filename))
        # Detach the Keras model before pickling (it doesn't pickle well),
        # then re-attach it so the object stays usable after save().
        seq2seq = self.seq2seq
        self.seq2seq = None
        with open('models/{}/chatbot_object.pkl'.format(filename), 'wb') as pickle_file:
            pickle.dump(self, pickle_file)
        self.seq2seq = seq2seq
    def respond(self, text):
        tokens = ['<START>'] + [w.lower() for w in word_tokenize(text)] + ['<END>']
        # Out-of-vocabulary words map to <UNK> rather than <PAD>, which
        # would conflate "unknown word" with "empty position".
        tokens = [self.vocabulary.get(t, self.vocabulary['<UNK>']) for t in tokens]
        x = sequence.pad_sequences([tokens], maxlen=self.max_length,
                                   value=self.vocabulary['<PAD>'])
        prediction = self.seq2seq.predict(x, batch_size=1)
        return prediction[0]
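For completeness, a hypothetical way the class could be driven (the two-sample corpus and the 'test_bot' name are made up, just to show the call sequence):

if __name__ == '__main__':
    # Toy data for illustration only; real training data would be larger.
    x = ['hello how are you', 'what is your name']
    y = ['i am fine', 'i am a bot']
    bot = Chatbot(h_layers=1)
    bot.train(x, y)
    bot.save('test_bot')                      # writes models/test_bot/
    bot = Chatbot.load('test_bot')
    print(bot.respond('hello how are you'))   # a float vector, not words yet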