# Character-level tokenization demo: collect the unique characters of a
# small corpus and treat them as the vocabulary.
text = "The animal didn't cross the street because it was too tired"
chars = sorted(list(set(text)))  # unique characters, sorted for a stable order
vocab_size = len(chars)
print(f"chars : {''.join(chars)}")   # -> " 'Tabcdehilmnorstuw"
print(f"vocab_size : {vocab_size}")  # -> 19
import torch
import torch.nn as nn

# Build dictionaries mapping characters to indices and indices to characters.
char_to_index = {char: index for index, char in enumerate(chars)}
index_to_char = {index: char for index, char in enumerate(chars)}

# Create the embedding layer (the comment in the original suggested roughly
# half of vocab_size as the embedding dimension; here it is fixed at 4).
embedding_dim = 4  # or any other desired number of dimensions
token_embedding_table = nn.Embedding(vocab_size, embedding_dim)

# Convert the whole text to indices and look up their embeddings.
text_indices = torch.tensor([char_to_index[char] for char in text])
text_embedding = token_embedding_table(text_indices)

# Inspect the result: one embedding row per character of `text`,
# i.e. shape (len(text), embedding_dim) == (59, 4).
print(f"text_embedding.shape : {text_embedding.shape}")
print(f"text_embedding : \n {text_embedding[:10]}")
class Head(nn.Module):
    """ One head of self-attention (first version: no causal mask yet). """

    def __init__(self, head_size):
        super().__init__()
        # Linear projections from the embedding dimension to the head dimension.
        self.key = nn.Linear(n_embd, head_size, bias=False)   # weight matrix
        self.query = nn.Linear(n_embd, head_size, bias=False) # weight matrix
        self.value = nn.Linear(n_embd, head_size, bias=False) # weight matrix
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities"), scaled by 1/sqrt(C)
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        # NOTE(review): the `return out` for this forward() sits on the next
        # source line, fused by the paste with the following class definition.
return outclass MultiHeadAttention(nn.Module):
""" multiple heads of self-attention in parallel """
def __init__(self, num_heads, head_size):
super().__init__()
**self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
self.proj = nn.Linear(n_embd, n_embd)**
self.dropout = nn.Dropout(dropout)
def forward(self, x):
**out = torch.cat([h(x) for h in self.heads], dim=-1)**
out = self.dropout(**self.proj(out)**)
return outclass Head(nn.Module):
""" one head of self-attention """
def __init__(self, head_size):
super().__init__()
...
self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
...
def forward(self, x):
...
wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
**wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)**
wei = F.softmax(wei, dim=-1) # (B, T, T)
wei = self.dropout(wei)
**** # perform the weighted aggregation of the values
v = self.value(x) # (B,T,C)
out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
return out
# read it in to inspect it
# (the file is the tiny-shakespeare corpus; it begins
#  "First Citizen: / Before we proceed any further, hear me speak. ...")
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Build the character vocabulary from the full corpus.
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)  # 61 for this corpus

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
print(f"encode('hii there') : {encode('hii there')}")
print(f"decode(encode('hii there')) : {decode(encode('hii there'))}")
# Demonstrate the (context -> target) pairs inside one training chunk:
# for every position t, the first t+1 tokens are the context that should
# predict token t+1.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """ Simplest possible language model: the logits for the next token are
    read directly out of an embedding table indexed by the current token. """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss); loss is None when no targets are given.

        idx and targets are both (B,T) tensors of integer token ids.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets, so flatten
            # the batch and time dimensions together.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend idx — a (B, T) batch of contexts — by max_new_tokens sampled tokens."""
        for _ in range(max_new_tokens):
            # get the predictions (loss is None here since no targets are passed)
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
# Instantiate the bigram model and sanity-check one forward pass.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
# Sample 100 tokens starting from a single zero-token context.
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

# A minimal training loop.
batch_size = 32
for steps in range(100): # increase number of steps for good results...
    # sample a batch of data
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# hyperparameters
batch_size = 16      # how many independent sequences will we process in parallel?
block_size = 32      # what is the maximum context length for predictions?
max_iters = 5000     # total number of optimizer steps
eval_interval = 100  # evaluate train/val loss every this many steps
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200     # NOTE(review): presumably batches averaged inside estimate_loss — its body is not shown here
dropout = 0.0
n_embd = 64   # embedding dimension
n_head = 4    # number of attention heads
n_layer = 4   # number of decoder blocks
# super simple bigram model
class BigramLanguageModel(nn.Module):
    """ The bigram model upgraded with position embeddings and a stack of
    Transformer blocks (tutorial excerpt; elided parts are marked `...`). """

    def __init__(self):
        super().__init__()
        ...
        # Learned embedding for each position 0..block_size-1.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # n_layer Transformer blocks applied in sequence.
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        ...
...class Block(nn.Module):
""" Transformer block: communication followed by computation """
def __init__(self, n_embd, n_head):
# n_embd: embedding dimension, n_head: the number of heads we'd like
super().__init__()
head_size = n_embd // n_head
**self.sa = MultiHeadAttention(n_head, head_size)**
self.ffwd = FeedFoward(n_embd)
self.ln1 = nn.LayerNorm(n_embd)
self.ln2 = nn.LayerNorm(n_embd)
def forward(self, x):
x = x + self.sa(self.ln1(x)) # residual
x = x + self.ffwd(self.ln2(x)) # residual
return xclass MultiHeadAttention(nn.Module):
""" multiple heads of self-attention in parallel """
def __init__(self, num_heads, head_size):
super().__init__()
**self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])**
self.proj = nn.Linear(n_embd, n_embd)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
**out = torch.cat([h(x) for h in self.heads], dim=-1)**
out = self.dropout(**self.proj(out)**)
return out
class Head(nn.Module):
    """ One head of causal (masked) self-attention. """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask; registered as a buffer so it is saved and
        # moved with the module but never trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities"), scaled by 1/sqrt(C)
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        # mask out future positions so each token attends only to the past
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        # NOTE(review): the `return out` for this forward() sits on the next
        # source line, fused by the paste with the code that follows.
return outmodel = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
# every once in a while evaluate the loss on train and val sets
if iter % eval_interval == 0 or iter == max_iters - 1:
losses = estimate_loss()
print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
# sample a batch of data
xb, yb = get_batch('train')
**# evaluate the loss
logits, loss = model(xb, yb)
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()**context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Decode and print 2000 sampled tokens from the trained model. (The original
# paste interleaved the model's generated sample text here — e.g. lines
# starting "BUCKINGHAM: / Thou, his lost to betchsed ingron ..." — which was
# program output, not source, and has been removed.)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))