This commit is contained in:
poka 2023-01-04 04:02:19 +01:00
commit 3126f289ff
5 changed files with 49225 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
__pycache__

48977
data/stackoverflow.csv Normal file

File diff suppressed because one or more lines are too long

86
functions.py Normal file

@@ -0,0 +1,86 @@
import torch
import numpy as np
import scipy
# Define the hyperparameters
num_layers = 2
batch_size = 32
hidden_dim = 256
def random_rotation(inputs):
    angle = np.random.uniform(-180, 180)
    inputs = scipy.ndimage.rotate(inputs, angle, reshape=False)
    return inputs
def random_scaling(inputs):
    scale = np.random.uniform(0.8, 1.2)
    inputs = scipy.ndimage.zoom(inputs, scale)
    return inputs
def random_translation(inputs):
    shift = np.random.uniform(-0.2, 0.2)
    inputs = scipy.ndimage.shift(inputs, shift)
    return inputs
def random_shearing(inputs):
    shear = np.random.uniform(-0.2, 0.2)
    # scipy.ndimage has no shear function; express the shear as an affine transform
    # over the last two axes (works for single images and for batches)
    matrix = np.eye(inputs.ndim)
    matrix[-2, -1] = shear
    inputs = scipy.ndimage.affine_transform(inputs, matrix)
    return inputs
def random_flipping(inputs):
    # scipy.ndimage has no flip function; use numpy (copy to drop the negative-stride view)
    inputs = np.flip(inputs, axis=1).copy()
    return inputs
def data_augmentation(inputs):
    # Apply random rotation
    inputs = random_rotation(inputs)
    # Apply random scaling
    inputs = random_scaling(inputs)
    # Apply random translation
    inputs = random_translation(inputs)
    # Apply random shearing
    inputs = random_shearing(inputs)
    # Apply random flipping
    inputs = random_flipping(inputs)
    return inputs
def evaluate(model, test_data, hyperparameters=None, recurrent_network=False, pre_trained_model=False, fine_tuning=False):
    # Use GPU for evaluation if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Define the initial hidden state for the recurrent case
    hidden = (torch.zeros(num_layers, batch_size, hidden_dim).to(device),
              torch.zeros(num_layers, batch_size, hidden_dim).to(device))
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for data in test_data:
            inputs, labels = data
            # Apply data augmentation (the scipy-based helpers operate on NumPy arrays)
            inputs = data_augmentation(inputs.numpy())
            # Convert back to a tensor and move everything to the device
            inputs = torch.from_numpy(inputs).to(device)
            labels = labels.to(device)
            # Use recurrent network
            if recurrent_network:
                outputs = model(inputs, hidden)
            else:
                outputs = model(inputs)
            # Use pre-trained model (assumes the model defines forward_from_pretrained)
            if pre_trained_model:
                outputs = model.forward_from_pretrained(inputs)
            # Use fine-tuning (assumes the model defines fine_tune)
            if fine_tuning:
                outputs = model.fine_tune(inputs, hyperparameters)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    return accuracy
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = 0.001 * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
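A minimal smoke test of the helpers above, assuming they are run inside this module on a single 2-D NumPy image; the dummy array and throwaway optimizer below are illustrative, not part of the commit:
# Quick check of the augmentation pipeline and the LR schedule on dummy data
import numpy as np
import torch
from torch import optim
image = np.random.rand(28, 28)           # a single 2-D "image"
augmented = data_augmentation(image)     # rotation, scaling, translation, shear, flip
print(augmented.shape)                   # random scaling may change the spatial size slightly
opt = optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=0.001)
adjust_learning_rate(opt, 35)            # 35 // 30 == 1, so the LR drops to 1e-4
print(opt.param_groups[0]['lr'])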

83
main.py Normal file

@@ -0,0 +1,83 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from gensim.corpora import WikiCorpus
from transformers import GPT2Tokenizer, GPT2Model
from functions import *
# Define the hyperparameters
num_layers = 2
batch_size = 32
hidden_dim = 256
# Load the GPT2 model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# Load the data
wiki_corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')
# TabularDataset expects (name, Field) pairs, and the CSV committed here lives under data/
TEXT = data.Field()
stackoverflow_corpus = data.TabularDataset('data/stackoverflow.csv', format='csv', fields=[('text', TEXT)])
# Preprocess the data: WikiCorpus yields lists of token strings, TabularDataset yields Examples
wiki_data = [text for text in wiki_corpus]
stackoverflow_data = [example.text for example in stackoverflow_corpus]
# Convert the data to a format compatible with PyTorch: encode the token lists with the
# GPT-2 tokenizer and pad to a common length (torch.tensor cannot be built from lists of strings)
def encode_and_pad(texts, max_len=512):
    ids = [tokenizer.encode(' '.join(tokens))[:max_len] for tokens in texts]
    longest = max(len(seq) for seq in ids)
    return torch.tensor([seq + [tokenizer.eos_token_id] * (longest - len(seq)) for seq in ids])
wiki_data = encode_and_pad(wiki_data)
stackoverflow_data = encode_and_pad(stackoverflow_data)
# Define the Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Define the loss function
criterion = nn.CrossEntropyLoss()
# Train the model
num_epochs = 10
labels = torch.tensor([0, 1, 1, 0, 0, 1, 0, 1, 0, 1])
# adjust_learning_rate comes from functions via the wildcard import above
for epoch in range(num_epochs):
    # Forward pass (note: GPT2Model's second positional argument is past_key_values, not a
    # second corpus, and its output is hidden states rather than class logits)
    outputs = model(wiki_data, stackoverflow_data)
    # Calculate the loss
    loss = criterion(outputs, labels)
    # Backward pass
    loss.backward()
    # Update the parameters
    optimizer.step()
    # Reset the gradients
    optimizer.zero_grad()
    # Evaluate the model (evaluate expects an iterable of (inputs, labels) batches)
    accuracy = evaluate(model, wiki_data)
    # Save the model weights and states
    torch.save(model.state_dict(), 'model.pth')
    # Adjust the learning rate
    adjust_learning_rate(optimizer, epoch)
# Define the model
class GPT(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        # Reuse the globally loaded GPT-2 model
        self.gpt2 = model
    def forward(self, x):
        # Embed the input
        x = self.embedding(x)
        # Pass through the GPT2 model (feed embeddings via inputs_embeds; this assumes
        # embedding_dim matches GPT-2's hidden size of 768)
        x = self.gpt2(inputs_embeds=x).last_hidden_state
        # Pass through the LSTM
        x, _ = self.lstm(x)
        # Pass through the fully connected layer
        x = self.fc(x)
        return x
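The wrapper class is defined but never instantiated in this script. A minimal sketch of how it might be constructed from the hyperparameters declared at the top of the file, assuming embedding_dim is set to GPT-2's hidden size; the variable names below are illustrative only:
# Hypothetical instantiation of the GPT wrapper defined above
vocab_size = tokenizer.vocab_size        # 50257 for the 'gpt2' tokenizer
embedding_dim = 768                      # must match GPT-2's hidden size for inputs_embeds
gpt = GPT(vocab_size, embedding_dim, hidden_dim, num_layers)
sample = torch.randint(0, vocab_size, (batch_size, 16))   # a dummy batch of token ids
logits = gpt(sample)                     # shape: (batch_size, 16, vocab_size)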

77
main2.py Normal file

@@ -0,0 +1,77 @@
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext import data
from gensim.corpora import WikiCorpus
from transformers import GPT2Tokenizer, GPT2Model
from functions import *
# Define the model
class GPT(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        # Reuses the global GPT-2 model loaded below, so instantiate GPT only after loading it
        self.gpt2 = model
    def forward(self, x):
        # Embed the input
        x = self.embedding(x)
        # Pass through the GPT2 model (feed embeddings via inputs_embeds; this assumes
        # embedding_dim matches GPT-2's hidden size of 768)
        x = self.gpt2(inputs_embeds=x).last_hidden_state
        # Pass through the LSTM
        x, _ = self.lstm(x)
        # Pass through the fully connected layer
        x = self.fc(x)
        return x
# Load the GPT2 model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# Load the data
wiki_corpus_en = WikiCorpus('data/enwiki-latest-pages-articles.xml.bz2')
wiki_corpus_fr = WikiCorpus('data/frwiki-latest-pages-articles.xml.bz2')
# stackoverflow_corpus = data.TabularDataset('data/stackoverflow.csv', format='csv', fields=['text'])
# Preprocess the data (WikiCorpus yields lists of token strings)
wiki_data_en = [text for text in wiki_corpus_en]
wiki_data_fr = [text for text in wiki_corpus_fr]
# stackoverflow_data = [text for text in stackoverflow_corpus]
# Convert the data to a format compatible with PyTorch: encode the token lists with the
# GPT-2 tokenizer and pad to a common length (torch.tensor cannot be built from lists of strings)
def encode_and_pad(texts, max_len=512):
    ids = [tokenizer.encode(' '.join(tokens))[:max_len] for tokens in texts]
    longest = max(len(seq) for seq in ids)
    return torch.tensor([seq + [tokenizer.eos_token_id] * (longest - len(seq)) for seq in ids])
wiki_data_en = encode_and_pad(wiki_data_en)
wiki_data_fr = encode_and_pad(wiki_data_fr)
# stackoverflow_data = torch.tensor(stackoverflow_data)
# Define the Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Define the loss function
criterion = nn.CrossEntropyLoss()
# Train the model
num_epochs = 10
labels = torch.tensor([0, 1, 1, 0, 0, 1, 0, 1, 0, 1])
for epoch in range(num_epochs):
    # Forward pass (note: GPT2Model's second positional argument is past_key_values, not a
    # second corpus, and its output is hidden states rather than class logits)
    # outputs = model(wiki_data, stackoverflow_data)
    outputs = model(wiki_data_en, wiki_data_fr)
    # Calculate the loss
    loss = criterion(outputs, labels)
    # Backward pass
    loss.backward()
    # Update the parameters
    optimizer.step()
    # Reset the gradients
    optimizer.zero_grad()
    # Evaluate the model (evaluate expects an iterable of (inputs, labels) batches)
    accuracy = evaluate(model, wiki_data_en)
    # Save the model weights and states
    torch.save(model.state_dict(), 'model.pth')
    # Adjust the learning rate
    adjust_learning_rate(optimizer, epoch)
    # Print the loss and accuracy
    print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.4f}'.format(epoch+1, loss.item(), accuracy))
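The loop above pushes both corpora through the model in one shot and reuses ten fixed labels, so outputs and labels never align per batch. Below is a minimal batched sketch under the assumption that the intent is a two-class objective (as the 0/1 labels and CrossEntropyLoss suggest); GPT2ForSequenceClassification, the dummy data, and the variable names are illustrative choices, not the author's code:
import torch
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2ForSequenceClassification
from functions import adjust_learning_rate
# Dummy data: 200 sequences of 128 token ids, each with a binary label
inputs = torch.randint(0, 50257, (200, 128))
labels = torch.randint(0, 2, (200,))
loader = DataLoader(TensorDataset(inputs, labels), batch_size=32, shuffle=True)
clf = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
clf.config.pad_token_id = clf.config.eos_token_id  # GPT-2 defines no pad token by default
opt = optim.Adam(clf.parameters(), lr=0.001)
for epoch in range(3):
    for batch_inputs, batch_labels in loader:
        out = clf(input_ids=batch_inputs, labels=batch_labels)  # the loss is computed internally
        out.loss.backward()
        opt.step()
        opt.zero_grad()
    adjust_learning_rate(opt, epoch)
    print('Epoch: {}, Loss: {:.4f}'.format(epoch + 1, out.loss.item()))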