poka 2023-01-04 15:33:25 +01:00
parent 3126f289ff
commit 5b2e45c645
2 changed files with 31 additions and 23 deletions

.gitignore

@@ -1,2 +1,2 @@
__pycache__
data/


@@ -7,58 +7,66 @@ from transformers import GPT2Tokenizer, GPT2Model
 from functions import *

 # Define the model
-class GPT(nn.Module):
-    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
-        super().__init__()
-        self.embedding = nn.Embedding(vocab_size, embedding_dim)
-        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
-        self.fc = nn.Linear(hidden_dim, vocab_size)
-        self.gpt2 = model
+# class GPT(nn.Module):
+#     def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
+#         super().__init__()
+#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
+#         self.fc = nn.Linear(hidden_dim, vocab_size)
+#         self.gpt2 = model

-    def forward(self, x):
-        # Embed the input
-        x = self.embedding(x)
-        # Pass through the GPT2 model
-        x = self.gpt2(x)
-        # Pass through the LSTM
-        x, _ = self.lstm(x)
-        # Pass through the fully connected layer
-        x = self.fc(x)
-        return x
+#     def forward(self, x):
+#         # Embed the input
+#         x = self.embedding(x)
+#         # Pass through the GPT2 model
+#         x = self.gpt2(x)
+#         # Pass through the LSTM
+#         x, _ = self.lstm(x)
+#         # Pass through the fully connected layer
+#         x = self.fc(x)
+#         return x

 # Load the GPT2 model
+print('load gpt2 model')
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 model = GPT2Model.from_pretrained('gpt2')

 # Load the data
-wiki_corpus_en = WikiCorpus('data/enwiki-latest-pages-articles.xml.bz2')
+print('load custom data')
+# wiki_corpus_en = WikiCorpus('data/enwiki-latest-pages-articles.xml.bz2')
 wiki_corpus_fr = WikiCorpus('data/frwiki-latest-pages-articles.xml.bz2')
 # stackoverflow_corpus = data.TabularDataset('data/stackoverflow.csv', format='csv', fields=['text'])

 # Preprocess the data
-wiki_data_en = [text for text in wiki_corpus_en]
+print('Preprocess the data')
+# wiki_data_en = [text for text in wiki_corpus_en]
 wiki_data_fr = [text for text in wiki_corpus_fr]
 # stackoverflow_data = [text for text in stackoverflow_corpus]

 # Convert the data to a format compatible with PyTorch
-wiki_data_en = torch.tensor(wiki_data_en)
+print('Convert the data to a format compatible with PyTorch')
+# wiki_data_en = torch.tensor(wiki_data_en)
 wiki_data_fr = torch.tensor(wiki_data_fr)
 # stackoverflow_data = torch.tensor(stackoverflow_data)

 # Define the Adam optimizer
+print('Define the Adam optimizer')
 optimizer = optim.Adam(model.parameters(), lr=0.001)

 # Define the loss function
+print('Define the loss function')
 criterion = nn.CrossEntropyLoss()

 # Train the model
+print('Train the model')
 num_epochs=10
 labels = torch.tensor([0, 1, 1, 0, 0, 1, 0, 1, 0, 1])
 for epoch in range(num_epochs):
+    print('epoch: ' + str(epoch))
     # Forward pass
     # outputs = model(wiki_data, stackoverflow_data)
-    outputs = model(wiki_data_en, wiki_data_fr)
+    outputs = model(wiki_data_fr)
     # Calculate the loss
     loss = criterion(outputs, labels)
     # Backward pass
@@ -68,7 +76,7 @@ for epoch in range(num_epochs):
     # Reset the gradients
     optimizer.zero_grad()
     # Evaluate the model
-    accuracy = evaluate(model, wiki_data_en)
+    accuracy = evaluate(model, wiki_data_fr)
     # Save the model weights and states
     torch.save(model.state_dict(), 'model.pth')
     # Adjust the learning rate
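
Even after this commit, the training loop as written would not run: WikiCorpus (presumably gensim's) yields variable-length Python lists, so torch.tensor(wiki_data_fr) over the whole corpus raises an error, GPT2Model returns hidden states rather than class logits, and the ten hard-coded labels correspond to nothing in the data. A minimal sketch of a loop that does run, assuming (this is not from the commit) GPT2LMHeadModel is substituted for GPT2Model so the model computes its own next-token loss, and a couple of inline strings stand in for wiki_data_fr:

import torch
from torch import optim
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
model = GPT2LMHeadModel.from_pretrained('gpt2')
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Hypothetical stand-in for wiki_data_fr; real data would be batched text
texts = ['Le chat dort.', 'La tour Eiffel est à Paris.']
batch = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

model.train()
for epoch in range(2):
    # Passing labels=input_ids makes the model shift them internally and
    # return the cross-entropy loss for next-token prediction. For brevity,
    # padding positions are not masked out of the loss here (a fuller
    # version would set them to -100 in the labels).
    outputs = model(input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['input_ids'])
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f'epoch {epoch}: loss {loss.item():.4f}')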