Divide a Hugging Face dataset into equal-sized parts with a uniform class distribution


I have local image data in which each subfolder contains the images of one class, and I loaded the data with load_dataset.

Then I noticed that feature extraction and training are very slow, so I want to divide the data into 10 parts, each containing N images of every class, and feed these 10 parts separately to the extractor and trainer.
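
Something like the sketch below is what I have in mind: group the example indices by class label, shuffle each group, and deal them out round-robin so every part gets an (almost) equal number of images of every class. This is just a rough, untested sketch of mine; make_stratified_shards is a placeholder name, and it assumes the train split has a label column, as it does for datasets built from per-class image folders.

import numpy as np

def make_stratified_shards(ds, num_shards=10, seed=0):
    # group example indices by class label, shuffle each group,
    # then deal them out round-robin so every shard gets an
    # (almost) equal number of images of every class
    labels = np.array(ds['label'])
    rng = np.random.default_rng(seed)
    shard_indices = [[] for _ in range(num_shards)]
    for lab in np.unique(labels):
        idx = np.flatnonzero(labels == lab)
        rng.shuffle(idx)
        for i, j in enumerate(idx):
            shard_indices[i % num_shards].append(int(j))
    return [ds.select(sorted(ix)) for ix in shard_indices]

# a list of 10 datasets.Dataset objects to feed to the extractor/trainer one at a time
shards = make_stratified_shards(dataset['train'], num_shards=10)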

Any suggestions?

By the way, this is the code (a customized copy from the Hugging Face blog):


import torch
from datasets import load_dataset
from transformers import (AutoFeatureExtractor, AutoModelForImageClassification,
                          Trainer, TrainingArguments)

DATASET_DIR = '/content/drive/MyDrive'
# the "imagefolder" builder infers the class labels from the per-class subfolder names
dataset = load_dataset("imagefolder", data_files={"train": f"{DATASET_DIR}/ML learning/flowers2**"})


labels = dataset['train'].features['label'].names

def transform(example_batch):
    # Take a list of PIL images and turn them to pixel values
    inputs = extractor([x for x in example_batch['image']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['label']
    return inputs


prepared_ds = dataset.with_transform(transform)

def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }


extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224",num_labels=len(labels),ignore_mismatched_sizes=True,
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)})
# redundant with num_labels above (the head is already re-initialized), but harmless
model.classifier = torch.nn.Linear(in_features=model.classifier.in_features, out_features=len(labels))

training_args = TrainingArguments(
    output_dir="./vit-base-flowers-v1",
    per_device_train_batch_size=16,
    evaluation_strategy="no",  # no eval_dataset is passed to the Trainer below
    num_train_epochs=4,
    # fp16=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    remove_unused_columns=False,
    push_to_hub=False,
    # report_to='tensorboard',
    # load_best_model_at_end=True,  # requires an eval_dataset and an evaluation strategy
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    # compute_metrics=compute_metrics,
    train_dataset=prepared_ds["train"],
    # eval_dataset=prepared_ds["test"],
    tokenizer=extractor,
)


train_results = trainer.train()
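
And this is roughly how I imagine looping over the 10 parts instead of the single trainer.train() call above, using the make_stratified_shards helper sketched further up (again untested; each call restarts the optimizer and learning-rate schedule, but the model weights carry over because the same model object is reused):

shards = make_stratified_shards(dataset['train'], num_shards=10)
for i, shard in enumerate(shards):
    # reuse the same Trainer and just swap in the next part of the data
    trainer.train_dataset = shard.with_transform(transform)
    print(f"part {i + 1}/{len(shards)}: {len(shard)} images")
    trainer.train()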


The only thing I have tried is splitting the data folder manually, which is not very automatic (and, of course, searching the whole net :) ).
