# Tokenize the prompt and move the tensors to the GPU.
tokenizer = AutoTokenizer.from_pretrained(model_id)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# `quantization_config` is assumed to be defined earlier (e.g. a GPTQConfig for the INT4 checkpoint).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
).to("cuda")
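No generation or decoding step appears for this snippet; a minimal sketch to round it out (the sampling setting and `max_new_tokens=256` are illustrative assumptions, not taken from the original) could be:

# Sketch: generate a completion from the tokenized inputs above and print it.
# do_sample=True and max_new_tokens=256 are assumed values for illustration.
outputs = model.generate(**input_ids, do_sample=True, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))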
Define the prompt and tokenize it.
prompt = [
    {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
    {"role": "user", "content": "What's Deep Learning?"},
]
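The "tokenize it" half of that step is not shown above; a minimal sketch using `tokenizer.apply_chat_template` (assuming the tokenizer loaded earlier and a CUDA device) could look like this:

# Sketch: render the chat prompt with the model's chat template, tokenize it,
# and move the resulting tensor of input IDs to the GPU.
inputs = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")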
# INSTALLATION
# pip install -q --upgrade transformers accelerate optimum
# pip install -q --no-build-isolation auto-gptq
#
# REQUIREMENTS
# An instance with at least ~210 GiB of total GPU memory when using the 405B model.
# The INT4 versions of the 70B and 8B models require ~35 GiB and ~4 GiB, respectively.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" messages = [ {"role": "system", "content": "You are a pirate"}, {"role": "user", "content": "What's Deep Leaning?"}, ]