
Commit ae32ce7

v1
1 parent 200bf0c commit ae32ce7

File tree

11 files changed, +258 -6 lines changed


.gitmodules

+3
@@ -14231,3 +14231,6 @@
[submodule "2024/07/19/petals_tensor_1"]
	path = 2024/07/19/petals_tensor_1
	url = https://github.com/hayotensor/petals_tensor_1
[submodule "2024/07/19/Phind-CodeLlama-34B-v2"]
	path = 2024/07/19/Phind-CodeLlama-34B-v2
	url = https://huggingface.co/Phind/Phind-CodeLlama-34B-v2

2023/05/01/FBGEMM

2023/07/17/experiments/pytorch

2024/02/29/bitsandbytes

2024/07/09/torch-mlir

Submodule torch-mlir updated 54 files

2024/07/17/notes.org

+32
@@ -84,3 +84,35 @@ will be more optimal so that cache lines will be more stable for different steps

By creating these futures contracts and paying miners for blocks of work, with the risk of losing a large amount for cheating, we can reduce the risk.

** idea 1

1. Clients want to process N requests at the best price, within a time period X.
2. They would escrow that money in a smart contract.
3. Miners would form squads to bid on that contract, submitting work samples.
   They would bid on buying block N at price P and selling the results for block N+1 at price P+1.
   If a miner is able to optimize that block and run it faster or cheaper, they can bid a lower cost or time and win the contract.
4. The client would pick the best squad based on past performance, work samples, and price.
5. Each node would buy the inputs for processing, decrypt them, process them,
   produce a small checksum (one float per layer), and sell the output to the next node, passing on the checksum.
6. Checksums would be aggregated and published together with the hash of the input: inputhash + checksums (see the sketch below).
7. If the client is not happy with the output, they can flag that transaction, and another squad can
   re-run the process, decrypting the input with the client's permission.
8. If there is a mismatch, confirmations would be gathered and the bad work flagged.
   The confirmations would have to be done by a third party.
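
A minimal Python sketch of the checksum flow in steps 5-6, under stated assumptions: the per-layer checksum is taken to be a simple mean activation (an arbitrary illustrative choice), and layer_checksum / publish_record are hypothetical names, not part of any existing protocol code.

import hashlib
import json

def layer_checksum(activations):
    """One float per layer: here simply the mean activation (illustrative)."""
    return sum(activations) / len(activations)

def publish_record(input_bytes, per_layer_activations):
    """Aggregate inputhash + per-layer checksums into a publishable record."""
    input_hash = hashlib.sha256(input_bytes).hexdigest()
    checksums = [layer_checksum(layer) for layer in per_layer_activations]
    record = {"inputhash": input_hash, "checksums": checksums}
    # The published commitment is a hash over the whole record.
    commitment = hashlib.sha256(
        json.dumps(record, sort_keys=True).encode()).hexdigest()
    return record, commitment

# Example: a node processed three layers and passes the record downstream.
record, commitment = publish_record(b"encrypted input block N",
                                    [[0.1, 0.2], [0.3, 0.4], [0.5]])
print(record["inputhash"][:16], record["checksums"], commitment[:16])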

** idea truebit

1. Clients want to process N requests at fixed prices, within a time period X.
2. They deposit gas fees into escrow.
3. The blockchain assigns the job to a squad randomly.
4. The squad outputs good and bad values.
5. A validator checks those outputs and says pass or fail; if fail, it goes to arbitration.
6. Arbitration: the squad has to validate the work, the secret is revealed, and the loser pays the legal fees.
   Both challenger and solver have to commit to their computation steps. The challenger checks a subset of steps.
   The challenger has to prove the solver is wrong (interactive zkp),
   narrowing down to a smaller and smaller problem (bisection; see the sketch below).
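
A toy Python sketch of the bisection step in 6. It models the computation as a list of committed intermediate states and narrows the dispute to a single step that an arbiter can re-execute directly. This only illustrates the bisection idea, not TrueBit's actual contracts; bisect_dispute, solver_states and honest_states are hypothetical.

def bisect_dispute(solver_states, honest_states):
    """Narrow a disputed computation to one step via bisection.

    Both parties agree on state 0 and disagree on the final state.
    Returns the index of the first disputed step, which the arbiter
    re-executes to decide who loses.
    """
    lo, hi = 0, len(solver_states) - 1   # agree at lo, disagree at hi
    while hi - lo > 1:
        mid = (lo + hi) // 2
        if solver_states[mid] == honest_states[mid]:
            lo = mid                     # agreement moves the left bound up
        else:
            hi = mid                     # disagreement moves the right bound down
    return hi                            # single step lo -> hi left to re-run

# Example: the solver cheats from step 6 onward in a 10-step computation.
honest = [f"s{i}" for i in range(11)]
cheated = honest[:6] + [f"x{i}" for i in range(6, 11)]
print(bisect_dispute(cheated, honest))   # -> 6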

2024/07/19/Phind-CodeLlama-34B-v2

Submodule Phind-CodeLlama-34B-v2 added at 949f61e

2024/07/20/notes.org

+18
@@ -0,0 +1,18 @@
Idea:

Take the grammar of a language, say Guile Scheme or the mes core language.
Use only a fixed set of identifiers, say those in the core language, say mes.
We can use the bootstrap code.
Profile the execution of the compilation and capture the functions.
Tokenize the language.
Generate statements from the grammar (sketched below).
Generate invalid/valid statements.
Look at the distribution of the encoding.

Reduce the token count to only the needed ones.
Autoencode the embedding and reduce its size.
Autoencode the first layer and reduce its size.

Finally, we can create a symbolic regression of the embedding and the first layer that will capture the core language.
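
A small Python sketch of the "generate statements from the grammar" step. The grammar and identifier set below are toy stand-ins for the mes/Scheme core language, and corrupt() is a hypothetical way of producing invalid statements (unbalancing parentheses); nothing here comes from the actual mes bootstrap code.

import random

# Toy stand-in for a core-language grammar: expr -> atom | (op expr expr)
IDENTIFIERS = ["cons", "car", "cdr", "if", "eq?", "+", "-"]
ATOMS = ["x", "y", "0", "1", "'()"]

def gen_expr(depth=3):
    """Generate a valid s-expression from the toy grammar."""
    if depth == 0 or random.random() < 0.3:
        return random.choice(ATOMS)
    op = random.choice(IDENTIFIERS)
    return f"({op} {gen_expr(depth - 1)} {gen_expr(depth - 1)})"

def corrupt(expr):
    """Make a statement invalid by unbalancing its parentheses (illustrative)."""
    return expr[:-1] if expr.endswith(")") else "(" + expr

random.seed(0)
valid = [gen_expr() for _ in range(5)]
invalid = [corrupt(e) for e in valid]
for v, i in zip(valid, invalid):
    print("valid:  ", v)
    print("invalid:", i)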

2024/07/20/test.py

+198
@@ -0,0 +1,198 @@
import re
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict


# Tokenization
def tokenize_mes(code):
    """Basic tokenization for MES: parentheses and whitespace-separated atoms."""
    tokens = re.findall(r'\(|\)|[^\s()]+', code)
    return tokens


def create_vocabulary(all_tokens):
    """Create a sorted vocabulary from the token stream."""
    return sorted(set(all_tokens))


def tokens_to_indices(tokens, vocab):
    """Convert tokens to vocabulary indices."""
    return [vocab.index(token) for token in tokens]


class EmbeddingAutoencoder(nn.Module):
    """Autoencoder for embedding compression."""

    def __init__(self, vocab_size, embedding_dim, compressed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.Sequential(
            nn.Linear(embedding_dim, compressed_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(compressed_dim, embedding_dim),
            nn.ReLU()
        )

    def forward(self, input_x):
        """Embed token indices, compress, then reconstruct the embedding."""
        embedded = self.embedding(input_x)
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded


def process_mes_code(mes_code, embedding_dim=16,
                     compressed_dim=4, epochs=1000):
    """Tokenize MES code and train the embedding autoencoder on it."""
    # Tokenize
    tokens = tokenize_mes(mes_code)

    # Create vocabulary
    vocab = create_vocabulary(tokens)
    vocab_size = len(vocab)

    # Convert to indices
    indices = tokens_to_indices(tokens, vocab)

    # Prepare data for PyTorch
    data = torch.tensor(indices)
    dataset = TensorDataset(data)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Initialize autoencoder
    model = EmbeddingAutoencoder(vocab_size, embedding_dim, compressed_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Train: reconstruct each token's embedding from its compressed code.
    for epoch in range(epochs):
        for batch in dataloader:
            inputs = batch[0]
            outputs = model(inputs)
            # Detach the reconstruction target so the loss trains the
            # encoder/decoder path instead of shrinking the embedding to zero.
            loss = criterion(outputs, model.embedding(inputs).detach())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    return model, vocab


def save_first_embedding(model, file_path):
    """Save the weights of the first embedding layer."""
    embedding_weights = model.embedding.weight.detach().cpu().numpy()
    np.save(file_path, embedding_weights)
    print(f"First embedding layer saved to {file_path}")


def load_first_embedding(file_path):
    """Load the saved weights of the first embedding layer."""
    return np.load(file_path)


def token_value_report(model, vocab):
    """Produce a report on the value of each token."""
    embedding_weights = model.embedding.weight.detach().cpu().numpy()

    # Calculate the L2 norm of each token's embedding
    token_norms = np.linalg.norm(embedding_weights, axis=1)

    # Calculate the cosine similarity between each pair of tokens
    similarity_matrix = embedding_weights @ embedding_weights.T
    norms = np.linalg.norm(embedding_weights, axis=1, keepdims=True)
    similarity_matrix /= norms
    similarity_matrix /= norms.T

    # Find the most similar tokens for each token
    most_similar = defaultdict(list)
    for i, token in enumerate(vocab):
        similarities = similarity_matrix[i]
        # Top 5 similar tokens (excluding self, which has similarity 1.0)
        most_similar_indices = np.argsort(similarities)[-6:-1]
        most_similar[token] = [(vocab[idx], similarities[idx])
                               for idx in most_similar_indices[::-1]]

    # Create a DataFrame with the results
    df = pd.DataFrame({
        'Token': vocab,
        'Embedding Norm': token_norms,
        'Most Similar Tokens': [most_similar[token] for token in vocab]
    })

    # Sort by embedding norm (you could change this to sort by a
    # different metric if desired)
    df = df.sort_values('Embedding Norm', ascending=False).reset_index(
        drop=True)

    return df


def visualize_token_distribution(df):
    """Visualize the distribution of token embedding norms."""
    plt.figure(figsize=(10, 6))
    plt.hist(df['Embedding Norm'], bins=30)
    plt.title('Distribution of Token Embedding Norms')
    plt.xlabel('Embedding Norm')
    plt.ylabel('Frequency')
    plt.savefig('token_norm_distribution.png')
    plt.close()


def analyze_mes_embeddings(model, vocab, save_path='mes_first_embedding.npy'):
    """Save the embedding layer and report on the learned token space."""
    # Save the first embedding layer
    save_first_embedding(model, save_path)

    # Produce token value report
    report = token_value_report(model, vocab)

    # Save report to CSV
    report.to_csv('token_value_report.csv', index=False)
    print("Token value report saved to token_value_report.csv")

    # Visualize token distribution
    visualize_token_distribution(report)
    print("Token norm distribution plot saved to token_norm_distribution.png")

    # Print top 10 tokens by embedding norm
    print("\nTop 10 tokens by embedding norm:")
    print(report[['Token', 'Embedding Norm']].head(10).to_string(index=False))

    # Print example of similar tokens
    print("\nExample of similar tokens:")
    example_token = report['Token'].iloc[0]
    print(f"Tokens most similar to '{example_token}':")
    for token, similarity in report['Most Similar Tokens'].iloc[0]:
        print(f"  {token}: {similarity:.4f}")


def main():
    """Driver"""
    # Example MES code (this is a simplified example)
    mes_code = """
    (define (factorial n)
      (if (= n 0)
          1
          (* n (factorial (- n 1)))))
    """

    # Process the code
    mes_model, vocab = process_mes_code(mes_code)

    print("Vocabulary size:", len(vocab))
    print("Compressed embedding size:",
          mes_model.encoder[0].out_features)

    analyze_mes_embeddings(mes_model, vocab)


if __name__ == "__main__":
    main()

0 commit comments
