From 5b2e45c645213b56d530cf9e1f21b00f413ca50f Mon Sep 17 00:00:00 2001
From: poka
Date: Wed, 4 Jan 2023 15:33:25 +0100
Subject: [PATCH] wip

---
 .gitignore |  2 +-
 main2.py   | 52 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index c20c2ab..1714277 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
 __pycache__
-
+data/
\ No newline at end of file
diff --git a/main2.py b/main2.py
index 81dae95..c236d52 100644
--- a/main2.py
+++ b/main2.py
@@ -7,58 +7,66 @@ from transformers import GPT2Tokenizer, GPT2Model
 from functions import *
 
 # Define the model
-class GPT(nn.Module):
-    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
-        super().__init__()
-        self.embedding = nn.Embedding(vocab_size, embedding_dim)
-        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
-        self.fc = nn.Linear(hidden_dim, vocab_size)
-        self.gpt2 = model
+# class GPT(nn.Module):
+#     def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
+#         super().__init__()
+#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
+#         self.fc = nn.Linear(hidden_dim, vocab_size)
+#         self.gpt2 = model
 
-    def forward(self, x):
-        # Embed the input
-        x = self.embedding(x)
-        # Pass through the GPT2 model
-        x = self.gpt2(x)
-        # Pass through the LSTM
-        x, _ = self.lstm(x)
-        # Pass through the fully connected layer
-        x = self.fc(x)
-        return x
+#     def forward(self, x):
+#         # Embed the input
+#         x = self.embedding(x)
+#         # Pass through the GPT2 model
+#         x = self.gpt2(x)
+#         # Pass through the LSTM
+#         x, _ = self.lstm(x)
+#         # Pass through the fully connected layer
+#         x = self.fc(x)
+#         return x
 
 # Load the GPT2 model
+print('load gpt2 model')
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 model = GPT2Model.from_pretrained('gpt2')
 
 # Load the data
-wiki_corpus_en = WikiCorpus('data/enwiki-latest-pages-articles.xml.bz2')
+print('load custom data')
+# wiki_corpus_en = WikiCorpus('data/enwiki-latest-pages-articles.xml.bz2')
 wiki_corpus_fr = WikiCorpus('data/frwiki-latest-pages-articles.xml.bz2')
 # stackoverflow_corpus = data.TabularDataset('data/stackoverflow.csv', format='csv', fields=['text'])
 
 # Preprocess the data
-wiki_data_en = [text for text in wiki_corpus_en]
+print('Preprocess the data')
+# wiki_data_en = [text for text in wiki_corpus_en]
 wiki_data_fr = [text for text in wiki_corpus_fr]
 # stackoverflow_data = [text for text in stackoverflow_corpus]
 
 # Convert the data to a format compatible with PyTorch
-wiki_data_en = torch.tensor(wiki_data_en)
+print('Convert the data to a format compatible with PyTorch')
+# wiki_data_en = torch.tensor(wiki_data_en)
 wiki_data_fr = torch.tensor(wiki_data_fr)
 # stackoverflow_data = torch.tensor(stackoverflow_data)
 
 # Define the Adam optimizer
+print('Define the Adam optimizer')
 optimizer = optim.Adam(model.parameters(), lr=0.001)
 
 # Define the loss function
+print('Define the loss function')
 criterion = nn.CrossEntropyLoss()
 
 # Train the model
+print('Train the model')
 num_epochs=10
 labels = torch.tensor([0, 1, 1, 0, 0, 1, 0, 1, 0, 1])
 for epoch in range(num_epochs):
+    print('epoch: ' + epoch)
     # Forward pass
     # outputs = model(wiki_data, stackoverflow_data)
-    outputs = model(wiki_data_en, wiki_data_fr)
+    outputs = model(wiki_data_fr)
     # Calculate the loss
     loss = criterion(outputs, labels)
     # Backward pass
@@ -68,7 +76,7 @@ for epoch in range(num_epochs):
     # Reset the gradients
     optimizer.zero_grad()
 
 # Evaluate the model
-accuracy = evaluate(model, wiki_data_en)
+accuracy = evaluate(model, wiki_data_fr)
 # Save the model weights and states
 torch.save(model.state_dict(), 'model.pth')
 # Adjust the learning rate
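
The training loop this patch leaves in main2.py will not run as-is: GPT2Model expects token IDs rather than raw WikiCorpus output, CrossEntropyLoss needs classification logits rather than hidden states, and print('epoch: ' + epoch) raises a TypeError because epoch is an int. Below is a minimal sketch of what one training step could look like under those assumptions; the texts, labels, and head names are illustrative stand-ins and are not part of the repository.

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token            # GPT-2 has no pad token by default
model = GPT2Model.from_pretrained('gpt2')

# Hypothetical mini-batch standing in for the WikiCorpus texts and labels
texts = ["Bonjour le monde", "Ceci est un exemple"]
labels = torch.tensor([0, 1])

# Tokenize to input_ids / attention_mask tensors
enc = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=64)

head = nn.Linear(model.config.n_embd, 2)              # small classification head on top of GPT-2
optimizer = optim.Adam(list(model.parameters()) + list(head.parameters()), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(2):
    outputs = model(**enc)                            # last_hidden_state: (batch, seq_len, n_embd)
    mask = enc['attention_mask'].unsqueeze(-1).float()
    pooled = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1)   # mean over real tokens
    logits = head(pooled)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f'epoch: {epoch}  loss: {loss.item():.4f}')  # f-string avoids the str + int TypeError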