# GPU offload settings — tune both to the model size and available VRAM.
gpu_layer_count = 32   # number of transformer layers to offload to the GPU
prompt_batch_size = 256  # must lie in [1, n_ctx]; larger batches need more VRAM

# Build the llama.cpp-backed LLM.
# NOTE(review): the ImportError pasted below ("cannot import name 'LlamaGrammar'")
# indicates a version mismatch between langchain and the installed
# llama-cpp-python — not a bug in this call. Also, this is a .ggmlv3 model file;
# llama-cpp-python >= 0.1.79 only reads GGUF, so the pinned versions must
# predate that — TODO confirm the environment's package versions.
llm = LlamaCpp(
    model_path="/root/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-chat-GGML/snapshots/b616819cd4777514e3a2d9b8be69824aca8f5daf/llama-2-7b-chat.ggmlv3.q5_1.bin",
    max_tokens=512,                 # cap on generated tokens per call
    n_gpu_layers=gpu_layer_count,
    n_batch=prompt_batch_size,
    callback_manager=callback_manager,  # defined elsewhere in this file/notebook
    n_ctx=4096,                     # Llama-2 context window
    verbose=True,
)
ImportError                               Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/langchain/llms/llamacpp.py in validate_environment(cls, values)
142 try:
--> 143 from llama_cpp import Llama
144 except ImportError:
ImportError: cannot import name 'LlamaGrammar' from 'llama_cpp' (/usr/local/lib/python3.10/dist-packages/llama_cpp/__init__.py)
During handling of the above exception, another exception occurred: