Skip to content

Commit

Permalink
Update loading of the pretrain dataset
Browse files — browse the repository at this point in the history
  • Loading branch information
yangjianxin1 committed Jan 11, 2024
1 parent e4786fa commit 6554f61
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 6 deletions.
4 changes: 2 additions & 2 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def group_texts(examples):
try:
processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
logger.info(f'Finished loading datasets-{file_name} from cache')
except:
except Exception:
tmp_cache_path = join(cache_path, 'tmp') # 临时缓存目录,会被自动删除
logger.info(f'There is no cache of file {file_name}, start preprocessing...')
raw_dataset = load_dataset("json", data_files=file, cache_dir=tmp_cache_path, keep_in_memory=False)
Expand All @@ -129,7 +129,7 @@ def group_texts(examples):
processed_dataset = grouped_datasets
processed_dataset.save_to_disk(cache_path)
# 删除临时目录
shutil.rmtree(tmp_cache_path)
# shutil.rmtree(tmp_cache_path)

logger.info(f"Training number of {file_name}: {len(processed_dataset['train'])}")
if idx == 0:
Expand Down
6 changes: 2 additions & 4 deletions train_args/pretrain/full/qwen-7b-pretrain-full.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"deepspeed": "./train_args/ds_z3_config.json",
"train_file": "./data/pretrain",
"num_train_epochs": 1,
"max_steps": 1000000,
"tokenize_num_workers": 10,
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
Expand All @@ -13,10 +12,9 @@
"task_type": "pretrain",

"logging_steps": 500,
"save_steps": 10000,
"save_total_limit": 1,
"save_steps": 500,
"lr_scheduler_type": "cosine",
"warmup_steps": 1000,
"warmup_ratio": 0.01,

"gradient_checkpointing": true,
"logging_first_step": false,
Expand Down

0 comments on commit 6554f61

Please sign in to comment.