From 5b2e45c645213b56d530cf9e1f21b00f413ca50f Mon Sep 17 00:00:00 2001
From: poka
Date: Wed, 4 Jan 2023 15:33:25 +0100
Subject: [PATCH] wip

---
 .gitignore |  2 +-
 main2.py   | 52 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index c20c2ab..1714277 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 __pycache__
-
+data/
\ No newline at end of file
diff --git a/main2.py b/main2.py
index 81dae95..c236d52 100644
--- a/main2.py
+++ b/main2.py
@@ -7,58 +7,66 @@ from transformers import GPT2Tokenizer, GPT2Model
 from functions import *
 
 # Define the model
-class GPT(nn.Module):
-    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
-        super().__init__()
-        self.embedding = nn.Embedding(vocab_size, embedding_dim)
-        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
-        self.fc = nn.Linear(hidden_dim, vocab_size)
-        self.gpt2 = model
+# class GPT(nn.Module):
+#     def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
+#         super().__init__()
+#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
+#         self.fc = nn.Linear(hidden_dim, vocab_size)
+#         self.gpt2 = model
 
-    def forward(self, x):
-        # Embed the input
-        x = self.embedding(x)
-        # Pass through the GPT2 model
-        x = self.gpt2(x)
-        # Pass through the LSTM
-        x, _ = self.lstm(x)
-        # Pass through the fully connected layer
-        x = self.fc(x)
-        return x
+#     def forward(self, x):
+#         # Embed the input
+#         x = self.embedding(x)
+#         # Pass through the GPT2 model
+#         x = self.gpt2(x)
+#         # Pass through the LSTM
+#         x, _ = self.lstm(x)
+#         # Pass through the fully connected layer
+#         x = self.fc(x)
+#         return x
 
 # Load the GPT2 model
+print('load gpt2 model')
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 model = GPT2Model.from_pretrained('gpt2')
 
 # Load the data
-wiki_corpus_en = WikiCorpus('data/enwiki-latest-pages-articles.xml.bz2')
+print('load custom data')
+# wiki_corpus_en = WikiCorpus('data/enwiki-latest-pages-articles.xml.bz2')
 wiki_corpus_fr = WikiCorpus('data/frwiki-latest-pages-articles.xml.bz2')
 # stackoverflow_corpus = data.TabularDataset('data/stackoverflow.csv', format='csv', fields=['text'])
 
 # Preprocess the data
-wiki_data_en = [text for text in wiki_corpus_en]
+print('Preprocess the data')
+# wiki_data_en = [text for text in wiki_corpus_en]
 wiki_data_fr = [text for text in wiki_corpus_fr]
 # stackoverflow_data = [text for text in stackoverflow_corpus]
 
 # Convert the data to a format compatible with PyTorch
-wiki_data_en = torch.tensor(wiki_data_en)
+print('Convert the data to a format compatible with PyTorch')
+# wiki_data_en = torch.tensor(wiki_data_en)
 wiki_data_fr = torch.tensor(wiki_data_fr)
 # stackoverflow_data = torch.tensor(stackoverflow_data)
 
 # Define the Adam optimizer
+print('Define the Adam optimizer')
 optimizer = optim.Adam(model.parameters(), lr=0.001)
 
 # Define the loss function
+print('Define the loss function')
 criterion = nn.CrossEntropyLoss()
 
 # Train the model
+print('Train the model')
 num_epochs=10
 labels = torch.tensor([0, 1, 1, 0, 0, 1, 0, 1, 0, 1])
 for epoch in range(num_epochs):
+    print('epoch: ' + epoch)
     # Forward pass
     # outputs = model(wiki_data, stackoverflow_data)
-    outputs = model(wiki_data_en, wiki_data_fr)
+    outputs = model(wiki_data_fr)
     # Calculate the loss
     loss = criterion(outputs, labels)
     # Backward pass
@@ -68,7 +76,7 @@ for epoch in range(num_epochs):
     # Reset the gradients
     optimizer.zero_grad()
 
 # Evaluate the model
-accuracy = evaluate(model, wiki_data_en)
+accuracy = evaluate(model, wiki_data_fr)
 # Save the model weights and states
 torch.save(model.state_dict(), 'model.pth')
 # Adjust the learning rate
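
The training loop this patch leaves in main2.py will not run as-is: GPT2Model expects token IDs rather than raw WikiCorpus output, CrossEntropyLoss needs classification logits rather than hidden states, and print('epoch: ' + epoch) raises a TypeError because epoch is an int. Below is a minimal sketch of what one training step could look like under those assumptions; the texts, labels, and head names are illustrative stand-ins and are not part of the repository.

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token            # GPT-2 has no pad token by default
model = GPT2Model.from_pretrained('gpt2')

# Hypothetical mini-batch standing in for the WikiCorpus texts and labels
texts = ["Bonjour le monde", "Ceci est un exemple"]
labels = torch.tensor([0, 1])

# Tokenize to input_ids / attention_mask tensors
enc = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=64)

head = nn.Linear(model.config.n_embd, 2)              # small classification head on top of GPT-2
optimizer = optim.Adam(list(model.parameters()) + list(head.parameters()), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(2):
    outputs = model(**enc)                            # last_hidden_state: (batch, seq_len, n_embd)
    mask = enc['attention_mask'].unsqueeze(-1).float()
    pooled = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1)   # mean over real tokens
    logits = head(pooled)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f'epoch: {epoch}  loss: {loss.item():.4f}')  # f-string avoids the str + int TypeError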