# Install required packages
!uv add transformers
!uv add unsloth vllm
!uv add faker
In [2]:
In [3]:
# Creating training data for fine-tuning a model on custom data.
training_samples = 20000
!uv run synthetic_data_generator.py {training_samples} > input_dataset/synth_training_data_{training_samples}.jsonl
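The synthetic_data_generator.py script itself is not shown in this notebook. As a rough sketch of what such a generator might look like, assuming it uses the faker package installed above and writes one ShareGPT-style record per line (the "conversations" format that standardize_sharegpt expects later), it could be something like:

# synthetic_data_generator.py -- hypothetical sketch, not the actual script used in this notebook.
# Writes one ShareGPT-style JSON record per line to stdout; the cell above redirects this into a .jsonl file.
import json
import sys

from faker import Faker

def main(n: int) -> None:
    fake = Faker()
    for _ in range(n):
        name = fake.name()
        record = {
            "conversations": [
                {"from": "human", "value": f"What is the name of customer {fake.uuid4()}?"},
                {"from": "gpt", "value": f"The customer's name is {name}."},
            ]
        }
        print(json.dumps(record))

if __name__ == "__main__":
    main(int(sys.argv[1]))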
In [4]:
from unsloth import FastLanguageModel # FastVisionModel is the counterpart for vision models
import torch

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
load_in_4bit = True   # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Phi-4", # Phi-4 2x faster!
    "unsloth/Phi-4-unsloth-bnb-4bit", # Phi-4 Unsloth Dynamic 4-bit Quant
    # More models at https://docs.unsloth.ai/get-started/all-our-models
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-4-unsloth-bnb-4bit",
    max_seq_length = max_seq_length,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
In [5]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-4",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize = False, add_generation_prompt = False
        )
        for convo in convos
    ]
    return { "text" : texts, }

from datasets import load_dataset

# Path to your local JSONL file
dataset = load_dataset("json", data_files = f"input_dataset/synth_training_data_{training_samples}.jsonl", split = "train")
In [6]:
from unsloth.chat_templates import standardize_sharegpt

dataset = standardize_sharegpt(dataset)
dataset = dataset.map(
    formatting_prompts_func,
    batched = True,
)
In [7]:
0]["conversations"] dataset[
In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
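A quick sanity check that only the LoRA adapter weights are trainable after get_peft_model (this counts parameters directly, so it does not depend on any particular wrapper helper):

# Optional: verify that only a small fraction of the model's weights require gradients.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")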
In [9]:
0]["text"] dataset[
In [10]:
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 240,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)
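With these settings the effective batch size is per_device_train_batch_size × gradient_accumulation_steps = 2 × 4 = 8 sequences per optimizer step, so max_steps = 240 touches roughly 240 × 8 = 1,920 of the 20,000 generated examples. Swap max_steps for num_train_epochs = 1 (as the commented line suggests) to train on the full dataset.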
In [11]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user<|im_sep|>",
    response_part = "<|im_start|>assistant<|im_sep|>",
)
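train_on_responses_only masks the user turn (everything from <|im_start|>user<|im_sep|> up to the assistant marker) by setting those label positions to -100, so only the assistant responses contribute to the loss. The next two cells verify this masking on a sample row.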
In [12]:
5]["input_ids"]) tokenizer.decode(trainer.train_dataset[
In [13]:
= tokenizer(" ", add_special_tokens = False).input_ids[0]
space if x == -100 else x for x in trainer.train_dataset[5]["labels"]]) tokenizer.decode([space
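Positions decoded as blanks above correspond to labels of -100. A small optional check, counting how much of this example actually contributes to the loss:

# Count masked (-100) vs. trained label positions for the same example.
labels = trainer.train_dataset[5]["labels"]
masked = sum(1 for x in labels if x == -100)
print(f"{masked}/{len(labels)} positions masked; {len(labels) - masked} assistant tokens are trained on.")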
In [14]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
In [15]:
trainer_stats = trainer.train()
In [16]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
In [17]:
import shutil
import os

# List of folder paths you want to delete
folders_to_delete = ["model", "llama.cpp"]

for folder in folders_to_delete:
    if os.path.exists(folder) and os.path.isdir(folder):
        shutil.rmtree(folder)
        print(f"Deleted: {folder}")
    else:
        print(f"Folder not found or not a directory: {folder}")
In [18]:
! git clone --recursive https://github.com/ggerganov/llama.cpp
! cd llama.cpp && cmake -B build && cmake --build build --config Release
In [19]:
# Save to 8bit Q8_0
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q8_0")
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )
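Once the Q8_0 export has finished, the resulting GGUF can be smoke-tested with the llama.cpp build from the previous cell. The exact filename written into model/ depends on the Unsloth version, so list the folder first and adjust the path; the llama-cli invocation below is an assumed example, not verified output:

# Adjust the .gguf filename to whatever save_pretrained_gguf actually wrote into model/.
!ls model/*.gguf
!./llama.cpp/build/bin/llama-cli -m model/unsloth.Q8_0.gguf -p "Hello" -n 64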
In [20]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-4",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Norwegian test prompt, roughly: "I'll try to see if it will answer with my name
# jeevith hegde; if it can do that, the model has learned well. Do not hallucinate with new values."
messages = [
    {"role": "user", "content": "Jeg skal prøve og se hvis den vil svare mitt navn jeevith hegde hvis den kan gjøre det at er det bra modell læring. Do not halucinate with new values"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(
    input_ids = inputs, max_new_tokens = 500, use_cache = True, temperature = 0.0, min_p = 0.0
)
tokenizer.batch_decode(outputs)
In [21]:
# from google.colab import userdata

# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit_forced",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
if False:
    model.push_to_hub("hf/model", token = "")
    tokenizer.push_to_hub("hf/model", token = "")
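If only the LoRA adapters are saved, they can be reloaded later without repeating the merge. A minimal sketch, assuming the adapters were written to the local "model" folder by model.save_pretrained("model") above:

# Reload the base model together with the saved LoRA adapters for later inference.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "model",      # folder containing the saved adapter files
    max_seq_length = 2048,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)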