Dark Mode

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Moe-vram for QQQ #2115

Open
Open
Moe-vram for QQQ #2115
Assignees

Description

Balanced mode does not work with GLM-4.5-Air and the QQQ method; tried on 4 x RTX 3090:

File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/looper/module_looper.py", line 1156, in loop
return self._loop_impl(fail_safe=fail_safe, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/looper/module_looper.py", line 1497, in _loop_impl
forward_outputs = self._run_forward_batches(
module=module,
...<17 lines>...
preserve_module_devices=preserve_devices,
)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/looper/module_looper.py", line 526, in _run_forward_batches
return self._run_forward_batches_single(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
module=module,
^^^^^^^^^^^^^^
...<16 lines>...
preserve_module_devices=preserve_module_devices,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/looper/module_looper.py", line 674, in _run_forward_batches_single
module_output = module(*layer_input, **additional_inputs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /transformers/modeling_layers.py", line 94, in __call__
return super().__call__(*args, **kwargs)
~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /transformers/utils/deprecation.py", line 172, in wrapped_func
return func(*args, **kwargs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /transformers/models/glm4_moe/modeling_glm4_moe.py", line 395, in forward
hidden_states = self.mlp(hidden_states)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /transformers/models/glm4_moe/modeling_glm4_moe.py", line 345, in forward
hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /transformers/models/glm4_moe/modeling_glm4_moe.py", line 331, in moe
expert_output = expert(expert_input)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /transformers/models/glm4_moe/modeling_glm4_moe.py", line 223, in forward
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
~~~~~~~~~~~~~~^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /torch/utils/_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/nn_modules/hooked_linear.py", line 239, in forward
self.forward_hook(self, (input,), output)
~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/looper/module_looper.py", line 954, in hook
return inner_hook(module, new_inputs, new_output)
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/looper/qqq_processor.py", line 100, in tmp
q.add_batch(inp[0].data, out.data) # noqa: F821
~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/venvs/gptqmodelt/lib/python3.13t/site-packages /gptqmodel/quantization/qqq.py", line 290, in add_batch
self.H += inp.matmul(inp.t())
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

--- Update:
It looks like vLLM does not support QQQ right now.

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions