drbh committed on
Commit 43ffb32 · 1 Parent(s): b975ca1
fix: cleanup test generations and update attributes
Browse files
- .gitattributes +11 -1
- .venv/index.html +0 -24
- .venv/lib/index.html +0 -24
- .venv/lib/python3.11/index.html +0 -24
- .venv/lib/python3.11/site-packages/flask/index.html +0 -24
- .venv/lib/python3.11/site-packages/flask/sansio/index.html +0 -24
- .venv/lib/python3.11/site-packages/index.html +0 -26
- .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html +0 -24
- .venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html +0 -24
- .venv/lib/python3.11/site-packages/werkzeug/debug/index.html +0 -24
- .venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html +0 -24
- .venv/lib/python3.11/site-packages/werkzeug/index.html +0 -24
- artifacts/charts/benchmark_dashboard.png +0 -0
- artifacts/charts/latency.png +0 -0
- artifacts/charts/memory.png +0 -0
- artifacts/charts/throughput.png +0 -0
- artifacts/setup/benchmark_avg_tokens_per_sec.txt +0 -1
- artifacts/setup/benchmark_dashboard.png +0 -0
- artifacts/setup/benchmark_memory.txt +0 -1
- artifacts/setup/benchmark_times.txt +0 -5
- cells/charts.py +0 -140
- cells/forward_and_backward.py +0 -102
- cells/forward_only.py +0 -96
- cells/nv.py +0 -3
- cells/setup.py +0 -116
- cells/setup2.py +0 -115
- index.html +0 -24
- megablocks_only.html +0 -0
- note.html +0 -0
- note_test_override.html +0 -0
- note_test_override.md +0 -261
- site/artifacts/charts/benchmark_dashboard.png +0 -0
- site/artifacts/charts/latency.png +0 -0
- site/artifacts/charts/memory.png +0 -0
- site/artifacts/charts/throughput.png +0 -0
- site/artifacts/setup/benchmark_avg_tokens_per_sec.txt +0 -1
- site/artifacts/setup/benchmark_dashboard.png +0 -0
- site/artifacts/setup/benchmark_memory.txt +0 -1
- site/artifacts/setup/benchmark_times.txt +0 -5
- site/cells/charts.py +0 -140
- site/cells/forward_and_backward.py +0 -102
- site/cells/forward_only.py +0 -96
- site/cells/setup.py +0 -116
- site/cells/setup2.py +0 -115
- site/megablocks_only.html +0 -0
- site/note.html +0 -0
- site/note_test_override.html +0 -0
- style.css +0 -28
.gitattributes
CHANGED
@@ -33,4 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-
+# Image files
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+*.tif filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+*.svg filter=lfs diff=lfs merge=lfs -text
+*.ico filter=lfs diff=lfs merge=lfs -text
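For reference, attribute rules like the added lines above are what `git lfs track` writes into .gitattributes. A minimal sketch of registering the same image patterns from Python (an assumption that git-lfs is installed and `git lfs install` has already been run; this helper is not part of the commit, and `git lfs track` does not add the "# Image files" comment line):

```python
# Hypothetical helper, not part of this commit: register the same image
# extensions with Git LFS, which appends matching rules to .gitattributes.
import subprocess

image_patterns = ["*.png", "*.jpg", "*.jpeg", "*.gif", "*.bmp",
                  "*.tiff", "*.tif", "*.webp", "*.svg", "*.ico"]
for pattern in image_patterns:
    subprocess.run(["git", "lfs", "track", pattern], check=True)
```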
.venv/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='lib/index.html' class='dir'>lib/</a></li>
</ul>
</body>
</html>
.venv/lib/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='python3.11/index.html' class='dir'>python3.11/</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='site-packages/index.html' class='dir'>site-packages/</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/flask/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages/flask</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='sansio/index.html' class='dir'>sansio/</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/flask/sansio/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages/flask/sansio</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='README.html' class='file'>README.html</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/index.html
DELETED
@@ -1,26 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='flask/index.html' class='dir'>flask/</a></li>
<li><a href='markdown-3.9.dist-info/index.html' class='dir'>markdown-3.9.dist-info/</a></li>
<li><a href='werkzeug/index.html' class='dir'>werkzeug/</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='licenses/index.html' class='dir'>licenses/</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages/markdown-3.9.dist-info/licenses</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='LICENSE.html' class='file'>LICENSE.html</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/werkzeug/debug/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='shared/index.html' class='dir'>shared/</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/werkzeug/debug/shared/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages/werkzeug/debug/shared</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='ICON_LICENSE.html' class='file'>ICON_LICENSE.html</a></li>
</ul>
</body>
</html>
.venv/lib/python3.11/site-packages/werkzeug/index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /.venv/lib/python3.11/site-packages/werkzeug</h1>
<ul>
<li><a href='../index.html' class='dir'>../</a></li>
<li><a href='debug/index.html' class='dir'>debug/</a></li>
</ul>
</body>
</html>
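All of the deleted pages above follow one mechanical "Directory Index" template: shared styles, an <h1> with the directory path, and one <li> per child entry. A hypothetical sketch of the kind of generator that could emit such pages (the actual tool that produced them is not shown in this commit, so names and structure here are assumptions):

```python
# Hypothetical sketch, not part of this commit: write one index.html per
# directory under a root, mirroring the deleted "Directory Index" pages.
import os

PAGE = (
    "<!DOCTYPE html>\n<html>\n<head>\n<meta charset='UTF-8'>\n"
    "<title>Directory Index</title>\n</head>\n<body>\n"
    "<h1>Index of /{path}</h1>\n<ul>\n{items}\n</ul>\n</body>\n</html>\n"
)

def write_indexes(root: str) -> None:
    for dirpath, dirnames, filenames in os.walk(root):
        items = ["<li><a href='../index.html' class='dir'>../</a></li>"]
        items += [f"<li><a href='{d}/index.html' class='dir'>{d}/</a></li>"
                  for d in sorted(dirnames)]
        items += [f"<li><a href='{f}' class='file'>{f}</a></li>"
                  for f in sorted(filenames) if f.endswith(".html")]
        with open(os.path.join(dirpath, "index.html"), "w") as fh:
            fh.write(PAGE.format(path=dirpath, items="\n".join(items)))
```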
artifacts/charts/benchmark_dashboard.png
DELETED
Binary file (87.7 kB)
artifacts/charts/latency.png
DELETED
Binary file (31.6 kB)
artifacts/charts/memory.png
DELETED
Binary file (46.3 kB)
artifacts/charts/throughput.png
DELETED
Binary file (37.4 kB)
artifacts/setup/benchmark_avg_tokens_per_sec.txt
DELETED
@@ -1 +0,0 @@
5.301658854167735
artifacts/setup/benchmark_dashboard.png
DELETED
Binary file (92.9 kB)
artifacts/setup/benchmark_memory.txt
DELETED
@@ -1 +0,0 @@
9.398672896,9.414898176,10.334765056
artifacts/setup/benchmark_times.txt
DELETED
@@ -1,5 +0,0 @@
12.075035744113848
12.0710428240709
12.070115809096023
12.070908240042627
12.071364195086062
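The deleted setup artifacts are mutually consistent: cells/charts.py (next) uses max_tokens = 64, and 64 divided by the mean of the five recorded generation times reproduces the stored average tokens/sec. A quick check using only the values shown above:

```python
# Sanity check: avg tokens/sec == max_tokens / mean(generation time).
times = [12.075035744113848, 12.0710428240709, 12.070115809096023,
         12.070908240042627, 12.071364195086062]
max_tokens = 64
avg_tokens_per_sec = max_tokens / (sum(times) / len(times))
print(avg_tokens_per_sec)  # ~5.3017, matching benchmark_avg_tokens_per_sec.txt
```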
cells/charts.py
DELETED
@@ -1,140 +0,0 @@
# /// script
# dependencies = [
#   "matplotlib",
#   "numpy",
# ]
# ///

import matplotlib.pyplot as plt
import numpy as np
import os

# get the pathf rom UVNOTE_SETUP env var
setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
print(f"Reading benchmark data from: {setup_path}")

num_runs = 5
max_tokens = 64
times = []
with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
    for line in f:
        times.append(float(line.strip()))


avg_time = 0.0
min_time = 0.0
max_time = 0.0
final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0}

avg_tokens_per_sec = 0.0
with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
    avg_tokens_per_sec = float(f.read().strip())

times_file = os.path.join(setup_path, "benchmark_times.txt")
memory_file = os.path.join(setup_path, "benchmark_memory.txt")


# Minimal brutalist palette (dark theme): grayscale + 1 accent
ACCENT = '#5ec8f8'  # calm cyan-blue accent
FG = '#e6e6e6'      # light gray text/lines
MUTED = '#9aa0a6'   # muted gray for secondary
GRID = '#333333'    # grid lines

# Styling tuned for clarity, high contrast, few colors
plt.style.use('dark_background')
plt.rcParams['figure.facecolor'] = 'none'
plt.rcParams['axes.facecolor'] = 'none'
plt.rcParams['savefig.facecolor'] = 'none'
plt.rcParams['savefig.transparent'] = True
plt.rcParams['font.family'] = 'monospace'
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.linewidth'] = 3
plt.rcParams['grid.linewidth'] = 2
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['patch.linewidth'] = 2

# Prepare data
runs = list(range(1, len(times) + 1))
tokens_per_sec_all = [max_tokens / t for t in times]

# Chart 1: Throughput Performance
fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
fig1.patch.set_alpha(0)
ax1.patch.set_alpha(0)

ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
         markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
            label=f'AVG: {avg_tokens_per_sec:.1f}')
ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
ax1.tick_params(colors=FG, labelsize=12)
legend1 = ax1.legend(frameon=False, loc='lower right')
for text in legend1.get_texts():
    text.set_color(FG)
    text.set_fontweight('bold')
plt.tight_layout()
plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

# Chart 2: Generation Latency
fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
fig2.patch.set_alpha(0)
ax2.patch.set_alpha(0)

bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
            label=f'AVG: {avg_time:.2f}s')
for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
    ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
             color=FG, fontweight='bold', fontsize=11)
ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
ax2.tick_params(colors=FG, labelsize=12)
ax2.set_ylim(0, max(times) * 1.15)
legend2 = ax2.legend(frameon=False, loc='upper right')
for text in legend2.get_texts():
    text.set_color(FG)
    text.set_fontweight('bold')
plt.tight_layout()
plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

# Chart 3: Memory Usage
fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
fig3.patch.set_alpha(0)
ax3.patch.set_alpha(0)

memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
colors_mem = [MUTED, ACCENT, FG]
bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
    ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
             color=FG, fontweight='bold', fontsize=13)
ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
ax3.set_xlim(0, max(memory_values) * 1.3)
ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
ax3.tick_params(colors=FG, labelsize=12)
ax3.set_yticks(range(len(memory_labels)))
ax3.set_yticklabels(memory_labels, fontweight='bold')
plt.tight_layout()
plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

print(f"\n📊 Charts saved as:")
print(f"   • throughput.png")
print(f"   • latency.png")
print(f"   • memory.png")
print(f"\nBenchmark Summary:")
print(f"   avg tokens/sec: {avg_tokens_per_sec:.1f}")
print(f"   min time: {min_time:.3f}s")
print(f"   max time: {max_time:.3f}s")
print(f"   peak memory: {final_mem['peak_gb']:.2f}GB")
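charts.py reads its input directory from the UVNOTE_INPUT_SETUP environment variable and carries inline (PEP 723) script metadata, so it could in principle be run standalone. A minimal driver sketch, assuming `uv` is installed and the setup artifacts live at artifacts/setup (both assumptions; this is not part of the commit):

```python
# Hypothetical driver: run the deleted chart cell against a local artifacts
# directory by setting the env var it reads (UVNOTE_INPUT_SETUP).
import os
import subprocess

env = dict(os.environ, UVNOTE_INPUT_SETUP="artifacts/setup")
subprocess.run(["uv", "run", "cells/charts.py"], env=env, check=True)
```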
cells/forward_and_backward.py
DELETED
@@ -1,102 +0,0 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm

# remove liger kernel for testing
replace_kernel_forward_from_hub(GptOssRMSNorm, None)

# set to debug logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
    training=True,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512


# forward and backward pass
with torch.autograd.set_grad_enabled(True):
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()
    print(tokenizer.decode(generated[0], skip_special_tokens=False))
    print(f"Generation took {end_time - start_time:.2f} seconds")
cells/forward_only.py
DELETED
@@ -1,96 +0,0 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# set to debug logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
cells/nv.py
DELETED
@@ -1,3 +0,0 @@
import subprocess

print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
cells/setup.py
DELETED
@@ -1,116 +0,0 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# set to debug logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode

from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
cells/setup2.py
DELETED
@@ -1,115 +0,0 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# set to debug logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode

from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
index.html
DELETED
@@ -1,24 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset='UTF-8'>
<title>Directory Index</title>
<style>
body { font-family: monospace; margin: 20px; }
h1 { font-size: 1.5em; }
ul { list-style-type: none; padding-left: 20px; }
li { margin: 5px 0; }
.dir { font-weight: bold; }
.file { color: #0066cc; }
a { text-decoration: none; }
a:hover { text-decoration: underline; }
</style>
</head>
<body>
<h1>Index of /</h1>
<ul>
<li><a href='flash_attn/index.html' class='dir'>flash_attn/</a></li>
<li><a href='moe_benchmarks/index.html' class='dir'>moe_benchmarks/</a></li>
</ul>
</body>
</html>
megablocks_only.html
DELETED
The diff for this file is too large to render. See raw diff
note.html
DELETED
The diff for this file is too large to render. See raw diff
note_test_override.html
DELETED
The diff for this file is too large to render. See raw diff
note_test_override.md
DELETED
|
@@ -1,261 +0,0 @@
---
title: "uvnote Integration Test Report"
author: "uvnote"
theme: "light"
syntax_theme: "monokai"
show_line_numbers: true
collapse_code: false
custom_css: |
    #output-setup {
        overflow-x: auto;
    }
    .cell-stdout {
        width: 100%;
    }
    .cell-stderr {
        width: max-content;
        max-height: 300px;
        overflow: auto;
    }
---

```python id=setup
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# set to debug logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode

from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")

```

# Reference kernel

```python id=setup2
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "accelerate>=1.10.1",
#   "torch>=2.7.0",
#   "kernels==0.10.0",
#   "transformers@https://github.com/huggingface/transformers.git",
#   "ipdb>=0.13.13",
#   "matplotlib>=3.7.2",
#   "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc
import logging

# set to debug logging
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)


from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode

from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm

replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
custom_mapping = {
    "Yamoe": {
        "cuda": {
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            )
        }
    }
}
register_kernel_mapping(custom_mapping)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 512

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")

```
site/artifacts/charts/benchmark_dashboard.png
DELETED
Binary file (87.7 kB)
site/artifacts/charts/latency.png
DELETED
Binary file (31.6 kB)
site/artifacts/charts/memory.png
DELETED
Binary file (46.3 kB)
site/artifacts/charts/throughput.png
DELETED
Binary file (37.4 kB)
site/artifacts/setup/benchmark_avg_tokens_per_sec.txt
DELETED
@@ -1 +0,0 @@
5.301658854167735
site/artifacts/setup/benchmark_dashboard.png
DELETED
Binary file (92.9 kB)
site/artifacts/setup/benchmark_memory.txt
DELETED
@@ -1 +0,0 @@
9.398672896,9.414898176,10.334765056
site/artifacts/setup/benchmark_times.txt
DELETED
@@ -1,5 +0,0 @@
12.075035744113848
12.0710428240709
12.070115809096023
12.070908240042627
12.071364195086062
site/cells/charts.py
DELETED
@@ -1,140 +0,0 @@
-# /// script
-# dependencies = [
-# "matplotlib",
-# "numpy",
-# ]
-# ///
-
-import matplotlib.pyplot as plt
-import numpy as np
-import os
-
-# get the pathf rom UVNOTE_SETUP env var
-setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
-print(f"Reading benchmark data from: {setup_path}")
-
-num_runs = 5
-max_tokens = 64
-times = []
-with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
-    for line in f:
-        times.append(float(line.strip()))
-
-
-avg_time = 0.0
-min_time = 0.0
-max_time = 0.0
-final_mem = {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0}
-
-avg_tokens_per_sec = 0.0
-with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
-    avg_tokens_per_sec = float(f.read().strip())
-
-times_file = os.path.join(setup_path, "benchmark_times.txt")
-memory_file = os.path.join(setup_path, "benchmark_memory.txt")
-
-
-# Minimal brutalist palette (dark theme): grayscale + 1 accent
-ACCENT = '#5ec8f8'  # calm cyan-blue accent
-FG = '#e6e6e6'      # light gray text/lines
-MUTED = '#9aa0a6'   # muted gray for secondary
-GRID = '#333333'    # grid lines
-
-# Styling tuned for clarity, high contrast, few colors
-plt.style.use('dark_background')
-plt.rcParams['figure.facecolor'] = 'none'
-plt.rcParams['axes.facecolor'] = 'none'
-plt.rcParams['savefig.facecolor'] = 'none'
-plt.rcParams['savefig.transparent'] = True
-plt.rcParams['font.family'] = 'monospace'
-plt.rcParams['font.weight'] = 'bold'
-plt.rcParams['axes.linewidth'] = 3
-plt.rcParams['grid.linewidth'] = 2
-plt.rcParams['lines.linewidth'] = 3
-plt.rcParams['patch.linewidth'] = 2
-
-# Prepare data
-runs = list(range(1, len(times) + 1))
-tokens_per_sec_all = [max_tokens / t for t in times]
-
-# Chart 1: Throughput Performance
-fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
-fig1.patch.set_alpha(0)
-ax1.patch.set_alpha(0)
-
-ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
-         markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
-ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
-ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
-            label=f'AVG: {avg_tokens_per_sec:.1f}')
-ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
-ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
-ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
-ax1.tick_params(colors=FG, labelsize=12)
-legend1 = ax1.legend(frameon=False, loc='lower right')
-for text in legend1.get_texts():
-    text.set_color(FG)
-    text.set_fontweight('bold')
-plt.tight_layout()
-plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-# Chart 2: Generation Latency
-fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
-fig2.patch.set_alpha(0)
-ax2.patch.set_alpha(0)
-
-bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
-bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
-ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
-            label=f'AVG: {avg_time:.2f}s')
-for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
-    ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
-             color=FG, fontweight='bold', fontsize=11)
-ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
-ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
-ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
-ax2.tick_params(colors=FG, labelsize=12)
-ax2.set_ylim(0, max(times) * 1.15)
-legend2 = ax2.legend(frameon=False, loc='upper right')
-for text in legend2.get_texts():
-    text.set_color(FG)
-    text.set_fontweight('bold')
-plt.tight_layout()
-plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-# Chart 3: Memory Usage
-fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
-fig3.patch.set_alpha(0)
-ax3.patch.set_alpha(0)
-
-memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
-memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
-colors_mem = [MUTED, ACCENT, FG]
-bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
-for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
-    ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
-             color=FG, fontweight='bold', fontsize=13)
-ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
-ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
-ax3.set_xlim(0, max(memory_values) * 1.3)
-ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
-ax3.tick_params(colors=FG, labelsize=12)
-ax3.set_yticks(range(len(memory_labels)))
-ax3.set_yticklabels(memory_labels, fontweight='bold')
-plt.tight_layout()
-plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
-plt.show()
-
-print(f"\n📊 Charts saved as:")
-print(f" • throughput.png")
-print(f" • latency.png")
-print(f" • memory.png")
-print(f"\nBenchmark Summary:")
-print(f" avg tokens/sec: {avg_tokens_per_sec:.1f}")
-print(f" min time: {min_time:.3f}s")
-print(f" max time: {max_time:.3f}s")
-print(f" peak memory: {final_mem['peak_gb']:.2f}GB")

site/cells/forward_and_backward.py
DELETED
@@ -1,102 +0,0 @@
-# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-# "accelerate>=1.10.1",
-# "torch>=2.7.0",
-# "kernels==0.10.0",
-# "transformers@https://github.com/huggingface/transformers.git",
-# "ipdb>=0.13.13",
-# "matplotlib>=3.7.2",
-# "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
-
-# remove liger kernel for testing
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-    training=True,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-
-# forward and backward pass
-with torch.autograd.set_grad_enabled(True):
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
-

site/cells/forward_only.py
DELETED
@@ -1,96 +0,0 @@
-# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-# "accelerate>=1.10.1",
-# "torch>=2.7.0",
-# "kernels==0.10.0",
-# "transformers@https://github.com/huggingface/transformers.git",
-# "ipdb>=0.13.13",
-# "matplotlib>=3.7.2",
-# "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")

site/cells/setup.py
DELETED
@@ -1,116 +0,0 @@
-# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-# "accelerate>=1.10.1",
-# "torch>=2.7.0",
-# "kernels==0.10.0",
-# "transformers@https://github.com/huggingface/transformers.git",
-# "ipdb>=0.13.13",
-# "matplotlib>=3.7.2",
-# "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")

site/cells/setup2.py
DELETED
@@ -1,115 +0,0 @@
-# /// script
-# requires-python = ">=3.12"
-# dependencies = [
-# "accelerate>=1.10.1",
-# "torch>=2.7.0",
-# "kernels==0.10.0",
-# "transformers@https://github.com/huggingface/transformers.git",
-# "ipdb>=0.13.13",
-# "matplotlib>=3.7.2",
-# "numpy>=1.24.3",
-# ]
-# ///
-
-import torch
-from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
-import time
-import torch.nn as nn
-from kernels import register_kernel_mapping, Mode, LayerRepository
-import sys
-import torch.profiler
-import gc
-import logging
-
-# set to debug logging
-logging.basicConfig(level=logging.INFO)
-
-def reset_peak_memory_stats():
-    """Clear CUDA cache and reset memory allocation counters."""
-    torch.cuda.empty_cache()
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()
-    gc.collect()
-
-def get_memory_stats():
-    """Get current and peak CUDA memory usage."""
-    if not torch.cuda.is_available():
-        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
-    return {
-        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
-        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
-        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
-    }
-
-def override_kernel_layer_name(cls_name: str, value) -> bool:
-    """Helper to dynamically override the kernel_layer_name in a model class."""
-    for mod in sys.modules.values():
-        if mod is None:
-            continue
-        obj = getattr(mod, cls_name, None)
-        if isinstance(obj, type) and issubclass(obj, nn.Module):
-            setattr(obj, "kernel_layer_name", value)
-            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
-            return True
-    return False
-
-
-# Init the model the normal way
-model_id = "openai/gpt-oss-20b"
-tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
-quantization_config = Mxfp4Config(dequantize=True)
-
-
-from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
-
-from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
-
-replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
-custom_mapping = {
-    "Yamoe": {
-        "cuda": {
-            Mode.INFERENCE: LayerRepository(
-                repo_id="drbh/yamoe",
-                layer_name="Yamoe",
-                revision="v0.3.0",
-            )
-        }
-    }
-}
-register_kernel_mapping(custom_mapping)
-
-
-model = GptOssForCausalLM.from_pretrained(
-    model_id,
-    dtype="bfloat16",
-    device_map="auto",
-    use_kernels=True,
-    quantization_config=quantization_config,
-).eval()
-
-messages = [
-    {"role": "system", "content": "What is Tensor Parallelism?"},
-]
-
-inputs = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    return_tensors="pt",
-    return_dict=True,
-    reasoning_effort="low",
-).to("cuda")
-
-max_tokens = 512
-
-with torch.inference_mode():
-    start_time = time.perf_counter()
-    generated = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=False,
-        temperature=None,
-    )
-    end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")

site/megablocks_only.html
DELETED
The diff for this file is too large to render. See raw diff.

site/note.html
DELETED
The diff for this file is too large to render. See raw diff.

site/note_test_override.html
DELETED
The diff for this file is too large to render. See raw diff.

style.css
DELETED
@@ -1,28 +0,0 @@
-body {
-    padding: 2rem;
-    font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
-}
-
-h1 {
-    font-size: 16px;
-    margin-top: 0;
-}
-
-p {
-    color: rgb(107, 114, 128);
-    font-size: 15px;
-    margin-bottom: 10px;
-    margin-top: 5px;
-}
-
-.card {
-    max-width: 620px;
-    margin: 0 auto;
-    padding: 16px;
-    border: 1px solid lightgray;
-    border-radius: 16px;
-}
-
-.card p:last-child {
-    margin-bottom: 0;
-}