From 18725395e38e0cb0f77089d07d328b89d07be7a4 Mon Sep 17 00:00:00 2001 From: NexVeridian Date: Fri, 25 Jul 2025 13:13:32 -0700 Subject: [PATCH] update llm --- content/blog/llm-inference-benchmarks.md | 185 ++++++----------------- justfile | 35 +++-- 2 files changed, 64 insertions(+), 156 deletions(-) diff --git a/content/blog/llm-inference-benchmarks.md b/content/blog/llm-inference-benchmarks.md index d427ac4..f14beda 100644 --- a/content/blog/llm-inference-benchmarks.md +++ b/content/blog/llm-inference-benchmarks.md @@ -6,48 +6,49 @@ date = 2025-05-06 tags = ["llm", "benchmarks", "llm-benchmarks", "lm-studio"] +++ -| Size (B) | Speed (T/s) | Model | Type | Quant | Spec Dec (B) | Spec Quant | -|----------|-------------|-----------------|------|--------|--------------|------------| -| 1.5 | 282 | qwen 2.5 | MLX | 4 | - | - | -| 1.5 | 76 | qwen 2.5 | MLX | 8 | - | - | -| 7 | 70 | qwen 2.5 | GUFF | Q4_K_M | - | - | -| 7 | 101 | qwen 2.5 | MLX | 4 | - | - | -| 7 | 58 | qwen 2.5 | MLX | 8 | - | - | -| 12 | 35 | wayfarer | GUFF | Q6_K | - | - | -| 12 | 65 | wayfarer | MLX | 4 | - | - | -| 12 | 45 | wayfarer | MLX | 6 | - | - | -| 12 | 36 | wayfarer | MLX | 8 | - | - | -| 14 | 36 | qwen 2.5 | GUFF | Q4_K_M | - | - | -| 14 | 52 | qwen 2.5 | MLX | 4 | - | - | -| 14 | 55 | qwen 2.5 | MLX | 4 | 1.5 | 4 | -| 14 | 30 | qwen 2.5 | MLX | 8 | - | - | -| 24 | 35 | mistral small 3 | MLX | 4 | - | - | -| 32 | 18 | qwen 2.5 | GUFF | Q4_K_M | - | - | -| 32 | 23 | qwen 2.5 | MLX | 4 | - | - | -| 32 | 30 | qwen 2.5 | MLX | 4 | 1.5 | 4 | -| 32 | 30 | qwen 2.5 | MLX | 4 | 1.5 | 4 | -| 32 | 34 | qwen 2.5 | MLX | 4 | 1.5 | 8 | -| 32 | 26 | qwen 2.5 r1 | MLX | 4 | 1.5 | 4 | -| 32 | 33 | qwen 2.5 coder | MLX | 4 | 1.5 | 4 | -| 32 | 31 | qwen 2.5 coder | MLX | 4 | 3 | 4 | -| 32 | 25 | qwq | MLX | 3 | - | - | -| 32 | 24 | qwq | MLX | 4 | - | - | -| 32 | 18 | qwq | MLX | 4 | 1.5 | 4 | -| 32 | 22 | qwq | MLX | 4 | 1.5 | 8 | -| 32 | 16 | qwq | MLX | 4 | 7 | 4 | -| 32 | 16 | qwq | MLX | 4 | 7 | 8 | -| 32 | 16 | qwq | MLX | 6 | - | - | -| 32 | 16 | qwq | MLX | 6 | 1.5 | 4 | -| 32 | 16 | qwq | MLX | 6 | 1.5 | 8 | -| 70 | 12 | wayfarer large | GUFF | Q2_K_S | - | - | -| 70 | 15 | wayfarer large | MLX | 3 | - | - | -| 30 - A3 | 93 | qwen 3 | MLX | 4 | - | - | -| 30 - A3 | 76 | qwen 3 | MLX | 4 | 1.7 | 4 | -| 30 - A3 | 81 | qwen 3 | MLX | 6 | - | - | -| 30 - A3 | 70 | qwen 3 | MLX | 6 | 1.7 | 4 | -| 30 - A3 | 70 | qwen 3 | MLX | 8 | - | - | -| 32 | 22 | qwen 3 | MLX | 4 | - | - | -| 32 | 26 | qwen 3 | MLX | 4 | 1.7 | 4 | +| Size (B) | Speed (T/s) | Model | Type | Quant | Spec Dec (B) | Spec Quant | +|----------|-------------|---------------------|------|--------|--------------|------------| +| 1.5 | 282 | qwen 2.5 | MLX | 4 | - | - | +| 1.5 | 76 | qwen 2.5 | MLX | 8 | - | - | +| 7 | 70 | qwen 2.5 | GUFF | Q4_K_M | - | - | +| 7 | 101 | qwen 2.5 | MLX | 4 | - | - | +| 7 | 58 | qwen 2.5 | MLX | 8 | - | - | +| 12 | 35 | wayfarer | GUFF | Q6_K | - | - | +| 12 | 65 | wayfarer | MLX | 4 | - | - | +| 12 | 45 | wayfarer | MLX | 6 | - | - | +| 12 | 36 | wayfarer | MLX | 8 | - | - | +| 14 | 36 | qwen 2.5 | GUFF | Q4_K_M | - | - | +| 14 | 52 | qwen 2.5 | MLX | 4 | - | - | +| 14 | 55 | qwen 2.5 | MLX | 4 | 1.5 | 4 | +| 14 | 30 | qwen 2.5 | MLX | 8 | - | - | +| 24 | 35 | mistral small 3 | MLX | 4 | - | - | +| 32 | 18 | qwen 2.5 | GUFF | Q4_K_M | - | - | +| 32 | 23 | qwen 2.5 | MLX | 4 | - | - | +| 32 | 30 | qwen 2.5 | MLX | 4 | 1.5 | 4 | +| 32 | 30 | qwen 2.5 | MLX | 4 | 1.5 | 4 | +| 32 | 34 | qwen 2.5 | MLX | 4 | 1.5 | 8 | +| 32 | 26 | qwen 2.5 r1 | MLX | 4 | 1.5 | 4 | +| 32 | 33 | qwen 2.5 coder | MLX | 4 | 1.5 | 4 | +| 32 | 31 | qwen 2.5 coder | MLX | 4 | 3 | 4 | +| 32 | 25 | qwq | MLX | 3 | - | - | +| 32 | 24 | qwq | MLX | 4 | - | - | +| 32 | 18 | qwq | MLX | 4 | 1.5 | 4 | +| 32 | 22 | qwq | MLX | 4 | 1.5 | 8 | +| 32 | 16 | qwq | MLX | 4 | 7 | 4 | +| 32 | 16 | qwq | MLX | 4 | 7 | 8 | +| 32 | 16 | qwq | MLX | 6 | - | - | +| 32 | 16 | qwq | MLX | 6 | 1.5 | 4 | +| 32 | 16 | qwq | MLX | 6 | 1.5 | 8 | +| 70 | 12 | wayfarer large | GUFF | Q2_K_S | - | - | +| 70 | 15 | wayfarer large | MLX | 3 | - | - | +| 30 - A3 | 93 | qwen 3 | MLX | 4 | - | - | +| 30 - A3 | 76 | qwen 3 | MLX | 4 | 1.7 | 4 | +| 30 - A3 | 81 | qwen 3 | MLX | 6 | - | - | +| 30 - A3 | 70 | qwen 3 | MLX | 6 | 1.7 | 4 | +| 30 - A3 | 70 | qwen 3 | MLX | 8 | - | - | +| 32 | 22 | qwen 3 | MLX | 4 | - | - | +| 32 | 26 | qwen 3 | MLX | 4 | 1.7 | 4 | +| 24 | 18 | Devstral Small 2507 | MLX | 8 | - | - | # mlx convert and upload to huggingface https://huggingface.co/docs/hub/en/mlx @@ -78,103 +79,3 @@ or use https://huggingface.co/spaces/mlx-community/mlx-my-repo | Temp | Min P | Top P | Top K | Repeat P | |------|-------|-------|-------|----------| | 0.7 | 0.00 | 0.80 | 20 | 1.5 | - -## Prompt Template -``` -{%- if tools %} - {{- '<|im_start|>system\n' }} - {%- if messages[0].role == 'system' %} - {{- messages[0].content + '\n\n' }} - {%- endif %} - {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} - {%- for tool in tools %} - {{- "\n" }} - {{- tool | tojson }} - {%- endfor %} - {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} -{%- else %} - {%- if messages[0].role == 'system' %} - {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} - {%- endif %} -{%- endif %} -{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} -{%- for message in messages[::-1] %} - {%- set index = (messages|length - 1) - loop.index0 %} - {%- set tool_start = "" %} - {%- set tool_start_length = tool_start|length %} - {%- set start_of_message = message.content[:tool_start_length] %} - {%- set tool_end = "" %} - {%- set tool_end_length = tool_end|length %} - {%- set start_pos = (message.content|length) - tool_end_length %} - {%- if start_pos < 0 %} - {%- set start_pos = 0 %} - {%- endif %} - {%- set end_of_message = message.content[start_pos:] %} - {%- if ns.multi_step_tool and message.role == "user" and not(start_of_message == tool_start and end_of_message == tool_end) %} - {%- set ns.multi_step_tool = false %} - {%- set ns.last_query_index = index %} - {%- endif %} -{%- endfor %} -{%- for message in messages %} - {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} - {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} - {%- elif message.role == "assistant" %} - {%- set content = message.content %} - {%- set reasoning_content = '' %} - {%- if message.reasoning_content is defined and message.reasoning_content is not none %} - {%- set reasoning_content = message.reasoning_content %} - {%- else %} - {%- if '' in message.content %} - {%- set content = (message.content.split('')|last).lstrip('\n') %} - {%- set reasoning_content = (message.content.split('')|first).rstrip('\n') %} - {%- set reasoning_content = (reasoning_content.split('')|last).lstrip('\n') %} - {%- endif %} - {%- endif %} - {%- if loop.index0 > ns.last_query_index %} - {%- if loop.last or (not loop.last and reasoning_content) %} - {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} - {%- else %} - {{- '<|im_start|>' + message.role + '\n' + content }} - {%- endif %} - {%- else %} - {{- '<|im_start|>' + message.role + '\n' + content }} - {%- endif %} - {%- if message.tool_calls %} - {%- for tool_call in message.tool_calls %} - {%- if (loop.first and content) or (not loop.first) %} - {{- '\n' }} - {%- endif %} - {%- if tool_call.function %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n{"name": "' }} - {{- tool_call.name }} - {{- '", "arguments": ' }} - {%- if tool_call.arguments is string %} - {{- tool_call.arguments }} - {%- else %} - {{- tool_call.arguments | tojson }} - {%- endif %} - {{- '}\n' }} - {%- endfor %} - {%- endif %} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} - {{- '<|im_start|>user' }} - {%- endif %} - {{- '\n\n' }} - {{- message.content }} - {{- '\n' }} - {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} - {%- if enable_thinking is defined and enable_thinking is false %} - {{- '\n\n\n\n' }} - {%- endif %} -{%- endif %} -``` diff --git a/justfile b/justfile index 560f234..5649927 100644 --- a/justfile +++ b/justfile @@ -14,12 +14,17 @@ docker: uv: uv venv - uv pip install huggingface_hub hf_transfer mlx_lm "mlx_lm[quant]" - uv run huggingface-cli login + just uv_install + uv run hf auth login -# just mlx_create "Qwen/Qwen3-30B-A3B" "3 4 5 6 8" "/Users/elijahmcmorris/.cache/lm-studio/models" NexVeridian false true +uv_install: + uv pip install -U huggingface_hub hf_transfer mlx_lm "mlx_lm[train]" tiktoken + # uv pip install -U huggingface_hub hf_transfer "git+https://github.com/ml-explore/mlx-lm@main" "git+https://github.com/ml-explore/mlx-lm@main[train]" + +# just mlx_create "Qwen/Qwen3-30B-A3B" "3 4 5 6 8" "/Users/elijahmcmorris/.cache/lm-studio/models" NexVeridian true true mlx_create hf_url quant lm_studio_path org="mlx-community" upload_repo="false" clean="true": #!/usr/bin/env bash + just uv_install repo_name=$(basename {{hf_url}}) just clean_lmstudio "{{hf_url}}" "{{quant}}" "{{lm_studio_path}}" "{{org}}" @@ -32,6 +37,7 @@ mlx_create hf_url quant lm_studio_path org="mlx-community" upload_repo="false" c --hf-path {{hf_url}} \ -q \ --q-bits ${q} \ + --trust-remote-code \ --upload-repo {{org}}/${repo_name}-${q}bit \ --mlx-path {{lm_studio_path}}/{{org}}/${repo_name}-${q}bit else @@ -39,6 +45,7 @@ mlx_create hf_url quant lm_studio_path org="mlx-community" upload_repo="false" c --hf-path {{hf_url}} \ -q \ --q-bits ${q} \ + --trust-remote-code \ --mlx-path {{lm_studio_path}}/{{org}}/${repo_name}-${q}bit fi @@ -47,41 +54,41 @@ mlx_create hf_url quant lm_studio_path org="mlx-community" upload_repo="false" c fi done -# just mlx_create_dynamic "Qwen/Qwen3-30B-A3B" 4 8 "/Users/elijahmcmorris/.cache/lm-studio/models" NexVeridian false false +# just mlx_create_dynamic "Qwen/Qwen3-14B" 4 8 "/Users/elijahmcmorris/.cache/lm-studio/models" NexVeridian true false # https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/LEARNED_QUANTS.md mlx_create_dynamic hf_url low high lm_studio_path org="mlx-community" upload_repo="false" clean="true": #!/usr/bin/env bash + just uv_install repo_name=$(basename {{hf_url}}) - rm -r {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}-{{high}}bit || true + rm -r {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}bit-{{high}}bit || true uv run mlx_lm.dynamic_quant \ --model {{hf_url}} \ --low-bits {{low}} \ --high-bits {{high}} \ - --mlx-path {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}-{{high}}bit + --mlx-path {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}bit-{{high}}bit if [[ {{upload_repo}} == "true" ]]; then uv run mlx_lm.upload \ - --path {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}-{{high}}bit \ - --upload-repo {{org}}/${repo_name}-{{low}}-{{high}}bit + --path {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}bit-{{high}}bit \ + --upload-repo {{org}}/${repo_name}-{{low}}bit-{{high}}bit fi if [[ {{clean}} == "true" ]]; then - rm -r {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}-{{high}}bit || true + rm -r {{lm_studio_path}}/{{org}}/${repo_name}-{{low}}bit-{{high}}bit || true fi -# just mlx_create_dwq "Qwen/Qwen3-30B-A3B" "4" "/Users/elijahmcmorris/.cache/lm-studio/models" NexVeridian false false +# just mlx_create_dwq "Qwen/Qwen3-30B-A3B" "4" "/Users/elijahmcmorris/.cache/lm-studio/models" NexVeridian true false # https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/LEARNED_QUANTS.md +# https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/quant/dwq.py mlx_create_dwq hf_url quant lm_studio_path org="mlx-community" upload_repo="false" clean="true": #!/usr/bin/env bash + just uv_install repo_name=$(basename {{hf_url}}) teacher_q="8" just clean_lmstudio "{{hf_url}}" "{{quant}}" "{{lm_studio_path}}" "{{org}}" "-DWQ-${teacher_q}bit" - just mlx_create "{{hf_url}}" "${teacher_q}" "{{lm_studio_path}}" "{{org}}" "false" "false" - just clean_lmstudio "{{hf_url}}" "${teacher_q}" "{{lm_studio_path}}" "{{org}}" - for q in {{quant}}; do rm {{lm_studio_path}}/{{org}}/${repo_name}-${q}bit-DWQ @@ -91,7 +98,7 @@ mlx_create_dwq hf_url quant lm_studio_path org="mlx-community" upload_repo="fals --quantized-model {{org}}/${repo_name}-${teacher_q}bit \ --bits ${q} \ --group-size 32 \ - --num-samples 1024 \ + --num-samples 512 \ --batch-size 1 \ --max-seq-length 512 \ --mlx-path {{lm_studio_path}}/{{org}}/${repo_name}-${q}bit-DWQ-${teacher_q}bit