add python code
bengio.py (new file, +248 lines)
@@ -0,0 +1,248 @@
import sys
import random

import numpy as np


class NeuralProbabilisticLanguageModel:
    def __init__(self, vocab_size, embedding_dim=100, context_size=3, hidden_dim=50, learning_rate=0.01):
        """
        Initialize Bengio's Neural Probabilistic Language Model

        Args:
            vocab_size: Size of the vocabulary
            embedding_dim: Dimension of word embeddings
            context_size: Number of previous words to consider
            hidden_dim: Dimension of hidden layer
            learning_rate: Learning rate for optimization
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.hidden_dim = hidden_dim
        self.learning_rate = learning_rate

        # Initialize model parameters
        # C: Word embedding matrix
        self.C = np.random.randn(vocab_size, embedding_dim) * 0.1

        # Parameters for hidden layer
        self.H = np.random.randn(context_size * embedding_dim, hidden_dim) * 0.1
        self.b_h = np.zeros(hidden_dim)

        # Parameters for output layer
        self.U = np.random.randn(hidden_dim, vocab_size) * 0.1
        self.W = np.random.randn(context_size * embedding_dim, vocab_size) * 0.1
        self.b_o = np.zeros(vocab_size)
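
    # In the notation of Bengio et al. (2003), C is the shared embedding
    # matrix, (H, b_h) are the hidden-layer weights and bias (H and d in the
    # paper), U maps the hidden layer to word scores, W holds the direct
    # input-to-output connections, and b_o is the output bias b; the
    # unnormalized scores are y = b + W x + U tanh(d + H x).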

    def forward(self, context_words):
        """
        Forward pass of the model

        Args:
            context_words: List of word indices for the context

        Returns:
            Probability distribution over next words
        """
        # Lookup embeddings for context words
        embeddings = self.C[context_words]
        x = embeddings.flatten()  # Concatenate embeddings

        # Compute hidden layer activation
        h = np.tanh(np.dot(x, self.H) + self.b_h)

        # Compute output layer
        # Direct connections from input to output (shortcut connections)
        y = np.dot(h, self.U) + np.dot(x, self.W) + self.b_o

        # Apply softmax to get probabilities
        exp_y = np.exp(y - np.max(y))  # Subtract max for numerical stability
        probabilities = exp_y / np.sum(exp_y)

        return probabilities, h, x
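
    # Shape check: with the defaults (context_size=3, embedding_dim=100,
    # hidden_dim=50), x has shape (300,), h has shape (50,), and the returned
    # probabilities vector has shape (vocab_size,) and sums to 1.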

    def compute_loss(self, probabilities, target_word):
        """
        Compute cross-entropy loss

        Args:
            probabilities: Predicted probability distribution
            target_word: Index of the target word

        Returns:
            Cross-entropy loss
        """
        return -np.log(probabilities[target_word])
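
    # Because the loss is cross-entropy over a softmax, the gradient of the
    # loss with respect to the pre-softmax scores y is probabilities minus a
    # one-hot vector for target_word; backward() applies this directly via
    # d_y[target_word] -= 1.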

    def backward(self, context_words, target_word, probabilities, h, x):
        """
        Backward pass for parameter updates

        Args:
            context_words: List of word indices for the context
            target_word: Index of the target word
            probabilities: Output probabilities from forward pass
            h: Hidden layer activation
            x: Input vector (concatenated embeddings)
        """
        # Gradient for output layer
        d_y = probabilities.copy()
        d_y[target_word] -= 1

        # Gradients for parameters
        d_U = np.outer(h, d_y)
        d_W = np.outer(x, d_y)
        d_b_o = d_y

        # Gradient for hidden layer
        d_h = np.dot(d_y, self.U.T)
        d_h_input = d_h * (1 - h**2)  # Derivative of tanh

        d_H = np.outer(x, d_h_input)
        d_b_h = d_h_input

        # Gradient for embeddings
        d_x = np.dot(d_h_input, self.H.T) + np.dot(d_y, self.W.T)
        d_C = np.zeros_like(self.C)

        # Update embeddings for context words
        for i, word_idx in enumerate(context_words):
            start = i * self.embedding_dim
            end = (i + 1) * self.embedding_dim
            d_C[word_idx] += d_x[start:end]

        # Update parameters
        self.U -= self.learning_rate * d_U
        self.W -= self.learning_rate * d_W
        self.b_o -= self.learning_rate * d_b_o
        self.H -= self.learning_rate * d_H
        self.b_h -= self.learning_rate * d_b_h
        self.C -= self.learning_rate * d_C

    def train_step(self, context_words, target_word):
        """
        Perform one training step

        Args:
            context_words: List of word indices for the context
            target_word: Index of the target word

        Returns:
            Loss for this example
        """
        probabilities, h, x = self.forward(context_words)
        loss = self.compute_loss(probabilities, target_word)
        self.backward(context_words, target_word, probabilities, h, x)
        return loss

    def train(self, data, n_epochs=5):
        """
        Train the model on a dataset

        Args:
            data: List of (context_words, target_word) tuples
            n_epochs: Number of training epochs

        Returns:
            List of average losses per epoch
        """
        losses = []

        for epoch in range(n_epochs):
            epoch_loss = 0
            for context_words, target_word in data:
                epoch_loss += self.train_step(context_words, target_word)

            avg_loss = epoch_loss / len(data)
            losses.append(avg_loss)
            print(f"Epoch {epoch+1}/{n_epochs}, Loss: {avg_loss:.4f}")

        return losses

    def predict_next_word(self, context_words, temperature=1.0):
        """
        Predict the next word given a context with temperature sampling

        Args:
            context_words: List of word indices for the context
            temperature: Controls randomness (higher = more random, lower = more deterministic)
                temperature=0 is equivalent to argmax (greedy)
                temperature=1.0 keeps the original distribution

        Returns:
            Index of the sampled next word
        """
        probabilities, _, _ = self.forward(context_words)

        if temperature == 0:
            # Greedy sampling (argmax)
            return np.argmax(probabilities)

        # Apply temperature scaling
        scaled_logits = np.log(probabilities) / temperature

        # Re-normalize to get a valid probability distribution
        exp_scaled = np.exp(scaled_logits - np.max(scaled_logits))  # Subtract max for numerical stability
        scaled_probs = exp_scaled / np.sum(exp_scaled)

        # Sample from the scaled distribution
        return np.random.choice(len(scaled_probs), p=scaled_probs)
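
    # Illustrative effect of temperature on a distribution [0.7, 0.2, 0.1]:
    # temperature 0.5 sharpens it to roughly [0.91, 0.07, 0.02], while
    # temperature 2.0 flattens it to roughly [0.52, 0.28, 0.20].
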
# Example usage
def preprocess_text(text, vocab, context_size):
    """Convert text to training examples"""
    words = text.split()
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    # Create training examples
    examples = []
    for i in range(len(words) - context_size):
        context = [word_to_idx[words[i+j]] for j in range(context_size)]
        target = word_to_idx[words[i+context_size]]
        examples.append((context, target))

    return examples
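
# For example, with context_size=2 the text "the cat sat on" yields the
# training pairs ([the, cat] -> sat) and ([cat, sat] -> on), with each word
# replaced by its index in vocab.
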
# Small example
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("synopsis: python3 bengio.py <file> <n_ctx> <n_epochs>")
        sys.exit(1)
    with open(sys.argv[1], "r") as buf:
        text = buf.read()
    if len(sys.argv) > 2:
        n_ctx = int(sys.argv[2])
    else:
        n_ctx = 2
    if len(sys.argv) > 3:
        n_epochs = int(sys.argv[3])
    else:
        n_epochs = 10

    n_predict = 100

    words = text.split()
    vocab = sorted(set(words))
    vocab.append(" ")

    examples = preprocess_text(text, vocab, n_ctx)

    model = NeuralProbabilisticLanguageModel(
        vocab_size=len(vocab),
        embedding_dim=10,
        context_size=n_ctx,
        hidden_dim=8
    )

    losses = model.train(examples, n_epochs=n_epochs)

    # Test model prediction
    pred = [vocab.index(w) for w in words[:n_ctx]]
    for i in range(n_predict):
        context = pred[-n_ctx:]
        predicted_idx = model.predict_next_word(context)
        pred.append(predicted_idx)

    output = [vocab[i] for i in pred]
    print(" ".join(output))
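
For reference, a minimal sketch of driving the class in this commit directly from Python, without the command-line wrapper. The toy corpus and hyperparameters below are illustrative, not part of the commit, and it assumes bengio.py is importable from the working directory:

    from bengio import NeuralProbabilisticLanguageModel, preprocess_text

    text = "the cat sat on the mat the dog sat on the rug"
    vocab = sorted(set(text.split()))
    examples = preprocess_text(text, vocab, context_size=2)

    model = NeuralProbabilisticLanguageModel(
        vocab_size=len(vocab), embedding_dim=10, context_size=2, hidden_dim=8
    )
    model.train(examples, n_epochs=5)

    # Sample a continuation of the first two words of the toy corpus
    context = [vocab.index("the"), vocab.index("cat")]
    print(vocab[model.predict_next_word(context, temperature=0.5)])

Alternatively, install the dependency from requirements.txt and run the script as described in its synopsis: python3 bengio.py <file> <n_ctx> <n_epochs>.
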
requirements.txt (new file, +1 line)
@@ -0,0 +1 @@
numpy