import pytest
import torch

from src.models.encoder import TransformerEncoder


def test_encoder_token_ids_and_padding_mask_and_grad():
    """
    Test token-id input, automatic padding-mask creation when pad_token_id is provided,
    the output shape, and that gradients flow through the model.
    """
    torch.manual_seed(0)
    vocab_size = 50
    pad_token_id = 0
    d_model = 64
    num_layers = 3
    num_heads = 8
    d_ff = 128
    batch_size = 2
    seq_len = 12

    encoder = TransformerEncoder(
        vocab_size=vocab_size,
        d_model=d_model,
        num_layers=num_layers,
        num_heads=num_heads,
        d_ff=d_ff,
        dropout=0.1,
        max_len=seq_len,
        pad_token_id=pad_token_id,
    )

    # create inputs with some padding at the end
    input_ids = torch.randint(1, vocab_size, (batch_size, seq_len), dtype=torch.long)
    input_ids[0, -3:] = pad_token_id  # first sample has last 3 tokens as padding
    input_ids[1, -1:] = pad_token_id  # second sample has last token as padding
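
    # Illustrative only: assuming the encoder derives its key-padding mask directly
    # from pad_token_id (an assumption about TransformerEncoder's internals that is
    # not asserted against here), the boolean pattern implied by these ids would be:
    reference_key_mask = input_ids != pad_token_id  # (batch_size, seq_len), True = real token
    assert not reference_key_mask[0, -3:].any() and not reference_key_mask[1, -1:].any()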

    # Forward pass (token ids)
    out = encoder(input_ids)  # default collect_attn=False
    assert out.shape == (batch_size, seq_len, d_model)

    # Check gradients flow
    loss = out.sum()
    loss.backward()
    grads = [p.grad for p in encoder.parameters() if p.requires_grad]
    assert any(g is not None for g in grads), "No gradients found on any parameter"


def test_encoder_embeddings_input_and_collect_attn():
    """
    Test passing pre-computed embeddings to the encoder, collecting attention weights,
    and verifying the shape of the attention map returned by each layer.
    """
    torch.manual_seed(1)
    vocab_size = 100  # not used in this test
    d_model = 48
    num_layers = 4
    num_heads = 6
    d_ff = 128
    batch_size = 1
    seq_len = 10

    encoder = TransformerEncoder(
        vocab_size=vocab_size,
        d_model=d_model,
        num_layers=num_layers,
        num_heads=num_heads,
        d_ff=d_ff,
        dropout=0.0,
        max_len=seq_len,
        pad_token_id=None,
    )

    # Create random embeddings directly
    embeddings = torch.randn(batch_size, seq_len, d_model)

    out, attn_list = encoder(embeddings, mask=None, collect_attn=True)
    assert out.shape == (batch_size, seq_len, d_model)
    assert isinstance(attn_list, list)
    assert len(attn_list) == num_layers

    # Each attention weight tensor should have shape (batch, num_heads, seq, seq)
    for attn in attn_list:
        assert attn.shape == (batch_size, num_heads, seq_len, seq_len)
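
    # If these weights are post-softmax probabilities (an assumption about what
    # collect_attn exposes, not something this test guarantees), each row over the
    # key dimension would sum to ~1.0; the sums are computed here for inspection only.
    row_sums = attn_list[0].sum(dim=-1)  # (batch_size, num_heads, seq_len)
    assert row_sums.shape == (batch_size, num_heads, seq_len)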


def test_mask_accepts_3d_and_4d_and_broadcasts():
    """
    Test that a provided 3D mask (batch, seq, seq) and an equivalent 4D mask
    (batch, 1, seq, seq) produce outputs of the same shape and do not error.
    """
    torch.manual_seed(2)
    vocab_size = 40
    d_model = 32
    num_layers = 2
    num_heads = 4
    d_ff = 64
    batch_size = 2
    seq_len = 7

    encoder = TransformerEncoder(
        vocab_size=vocab_size,
        d_model=d_model,
        num_layers=num_layers,
        num_heads=num_heads,
        d_ff=d_ff,
        dropout=0.0,
        max_len=seq_len,
        pad_token_id=None,
    )

    # Create dummy embeddings
    embeddings = torch.randn(batch_size, seq_len, d_model)

    # 3D mask: True indicates allowed attention
    mask3 = torch.ones(batch_size, seq_len, seq_len, dtype=torch.bool)
    mask3[:, :, -2:] = False  # mask out last two keys

    # 4D mask equivalent
    mask4 = mask3.unsqueeze(1)  # (B, 1, S, S)
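
    # Note: a plain key-padding view could in principle be written as a 2D (batch, seq)
    # mask broadcast via mask2d[:, None, None, :]; whether TransformerEncoder accepts a
    # 2D mask is not specified here, so only the 3D and 4D variants are exercised.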

    out3 = encoder(embeddings, mask=mask3)
    out4 = encoder(embeddings, mask=mask4)

    assert out3.shape == (batch_size, seq_len, d_model)
    assert out4.shape == (batch_size, seq_len, d_model)
    # Outputs should be finite and not NaN
    assert torch.isfinite(out3).all()
    assert torch.isfinite(out4).all()


def test_train_eval_determinism_and_dropout_effect():
    """
    Validate that in train mode with dropout enabled, repeated forwards differ,
    and in eval mode they are equal (deterministic).
    """
    torch.manual_seed(3)
    vocab_size = 60
    pad_token_id = 0
    d_model = 64
    num_layers = 2
    num_heads = 8
    d_ff = 128
    batch_size = 2
    seq_len = 9

    encoder = TransformerEncoder(
        vocab_size=vocab_size,
        d_model=d_model,
        num_layers=num_layers,
        num_heads=num_heads,
        d_ff=d_ff,
        dropout=0.4,
        max_len=seq_len,
        pad_token_id=pad_token_id,
    )

    # token ids with occasional padding
    input_ids = torch.randint(1, vocab_size, (batch_size, seq_len), dtype=torch.long)
    input_ids[0, -2:] = pad_token_id

    # Training mode: randomness due to dropout -> outputs should likely differ
    encoder.train()
    out1 = encoder(input_ids)
    out2 = encoder(input_ids)
    assert not torch.allclose(out1, out2), "Outputs identical in train mode despite dropout"

    # Eval mode: deterministic
    encoder.eval()
    out3 = encoder(input_ids)
    out4 = encoder(input_ids)
    assert torch.allclose(out3, out4), "Outputs differ in eval mode"
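
    # This determinism check assumes dropout is the only source of randomness in the
    # encoder; a model that sampled anything else at inference time would not satisfy it.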


if __name__ == "__main__":
    pytest.main([__file__, "-q"])