Skip to content

Commit

Permalink
- index_array_iterator.py added get_sample_number, get_batch_size, set_index_probability methods
Browse files Browse the repository at this point in the history

- helpers: modified get_histogram bin computation
- improved hard_sample_mining.py callback (fixed memory leak)
- added publish workflow
  • Loading branch information
jeanollion committed Apr 12, 2024
1 parent 5a4a778 commit 10f990f
Show file tree
Hide file tree
Showing 9 changed files with 399 additions and 194 deletions.
2 changes: 0 additions & 2 deletions .github/FUNDING.yml

This file was deleted.

116 changes: 116 additions & 0 deletions .github/workflows/publish-to-pypi.yml
@@ -0,0 +1,116 @@
name: Publish Python distribution 📦 to TestPyPI

# Triggered on every push:
#   * `build` always runs and stores the sdist/wheel as a workflow artifact.
#   * `publish-to-testpypi` always runs and uploads that artifact to TestPyPI.
#   * `publish-to-pypi` only runs for tag pushes; `github-release` depends on
#     it, so it is skipped as well when the push is not a tag.
on: push

jobs:
  build:
    name: Build distribution 📦
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.x"
      - name: Install pypa/build
        run: >-
          python3 -m
          pip install
          build
          --user
      - name: Build a binary wheel and a source tarball
        run: python3 -m build
      - name: Store the distribution packages
        # v3 of the artifact actions is deprecated and rejected by GitHub;
        # upload and download must both use v4 (the formats are incompatible).
        uses: actions/upload-artifact@v4
        with:
          name: python-package-distributions
          path: dist/

  publish-to-pypi:
    name: >-
      Publish Python 🐍 distribution 📦 to PyPI
    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
    needs:
      - build
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/dataset-iterator
    permissions:
      id-token: write  # IMPORTANT: mandatory for trusted publishing
    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

  github-release:
    name: >-
      Sign the Python distribution 📦 with Sigstore
      and upload them to GitHub Release
    needs:
      - publish-to-pypi
    runs-on: ubuntu-latest

    permissions:
      contents: write  # IMPORTANT: mandatory for making GitHub Releases
      id-token: write  # IMPORTANT: mandatory for sigstore

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
      - name: Sign the dists with Sigstore
        uses: sigstore/gh-action-sigstore-python@v3.0.0
        with:
          inputs: >-
            ./dist/*.tar.gz
            ./dist/*.whl
      - name: Create GitHub Release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        # Use the runner-provided environment variables instead of expanding
        # `${{ ... }}` inside the script, so a crafted tag name cannot inject
        # shell code.
        run: >-
          gh release create
          "$GITHUB_REF_NAME"
          --repo "$GITHUB_REPOSITORY"
          --notes ""
      - name: Upload artifact signatures to GitHub Release
        env:
          GITHUB_TOKEN: ${{ github.token }}
        # Upload to GitHub Release using the `gh` CLI.
        # `dist/` contains the built packages, and the
        # sigstore-produced signatures and certificates.
        run: >-
          gh release upload
          "$GITHUB_REF_NAME" dist/**
          --repo "$GITHUB_REPOSITORY"

  publish-to-testpypi:
    name: Publish Python distribution 📦 to TestPyPI
    needs:
      - build
    runs-on: ubuntu-latest

    environment:
      name: testpypi
      url: https://test.pypi.org/p/dataset-iterator

    permissions:
      id-token: write  # IMPORTANT: mandatory for trusted publishing

    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
      - name: Publish distribution 📦 to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/
          # This job runs on every push; without this, any push that does not
          # bump the package version fails because the files already exist.
          skip-existing: true
2 changes: 1 addition & 1 deletion dataset_iterator/__init__.py
Expand Up @@ -7,5 +7,5 @@
from .image_data_generator import get_image_data_generator

from .datasetIO import DatasetIO, H5pyIO, MultipleFileIO, MultipleDatasetIO, ConcatenateDatasetIO, MemoryIO
from .active_learning import get_index_probability, compute_loss
from .hard_sample_mining import HardSampleMiningCallback
from .concat_iterator import ConcatIterator
176 changes: 0 additions & 176 deletions dataset_iterator/active_learning.py

This file was deleted.

42 changes: 37 additions & 5 deletions dataset_iterator/concat_iterator.py
@@ -1,3 +1,4 @@
from math import isclose
import numpy as np
from .index_array_iterator import IndexArrayIterator, INCOMPLETE_LAST_BATCH_MODE
from .utils import ensure_multiplicity, ensure_size
Expand All @@ -12,7 +13,23 @@ def __init__(self,
incomplete_last_batch_mode:str=INCOMPLETE_LAST_BATCH_MODE[1],
step_number:int=0):
assert isinstance(iterators, (list, tuple)), "iterators must be either list or tuple"
self.iterators = iterators
self.iterators = []
def append_it(iterator): # unroll concat iterators
if isinstance(iterator, (list, tuple)):
for subit in iterator:
append_it(subit)
elif isinstance(iterator, ConcatIterator):
for subit in iterator.iterators:
append_it(subit)
else:
self.iterators.append(iterator)

append_it(iterators)
bs = [it.get_batch_size() for it in self.iterators]
assert np.all(np.array(bs) == bs[0] ), "all sub iterator batch_size must be equal"
for it in self.iterators:
it.incomplete_last_batch_mode = incomplete_last_batch_mode
self.sub_iterator_batch_size = bs[0]
if proportion is None:
proportion = [1.]
self.proportion = ensure_multiplicity(len(iterators), proportion)
Expand All @@ -29,6 +46,12 @@ def _get_index_array(self, choice:bool = True): # return concatenated indices fo
array = np.random.choice(array, size=array.shape[0], replace=True, p=self.index_probability)
return array

def get_sample_number(self):
    """Return the last entry of ``self.it_cumlen``.

    NOTE(review): ``it_cumlen`` presumably holds the cumulative sample
    counts of the sub-iterators, making this the total number of samples --
    confirm against where ``it_cumlen`` is built.
    """
    last_cumulative_length = self.it_cumlen[-1]
    return last_cumulative_length

def get_batch_size(self):
    """Return ``self.batch_size`` multiplied by ``self.sub_iterator_batch_size``.

    Each index drawn by this iterator fetches a whole batch from a
    sub-iterator, so the effective batch size is the product of both.
    """
    own_batch_size = self.batch_size
    per_sub_iterator = self.sub_iterator_batch_size
    return own_batch_size * per_sub_iterator

def _set_index_array(self):
indices_per_iterator = []
for i, it in enumerate(self.iterators):
Expand Down Expand Up @@ -56,16 +79,16 @@ def __len__(self):

def _get_batches_of_transformed_samples(self, index_array):
    """Fetch one batch per index from the sub-iterators and concatenate them.

    Args:
        index_array: indices into the concatenated index space. It is copied
            first, so the caller's array is left untouched.

    Returns:
        If the batches are of length 2 they are treated as
        ``(inputs, outputs)`` pairs and a tuple of the two concatenated
        results is returned; otherwise the concatenation of the batches
        as a whole.

    Raises:
        AssertionError: if the batches returned by the sub-iterators do not
            all have the same length.
    """
    index_array = np.copy(index_array)  # so that main index array is not modified
    # _get_it_idx modifies index_array in place so that indices are relative
    # to each iterator; it must therefore be called exactly once.
    index_it = self._get_it_idx(index_array)
    # Each (iterator, relative index) pair yields one sub-iterator batch.
    batches = [self.iterators[it][i] for i, it in zip(index_array, index_it)]
    reference_length = len(batches[0])
    for position, batch in enumerate(batches):
        assert len(batch) == reference_length, f"Iterators have different outputs: batch from iterator {index_it[0]} has length {reference_length} whereas batch from iterator {index_it[position]} has length {len(batch)}"
    # concatenate batches
    if reference_length == 2:
        inputs = [b[0] for b in batches]
        outputs = [b[1] for b in batches]
        return concat_numpy_arrays(inputs), concat_numpy_arrays(outputs)
    else:
        return concat_numpy_arrays(batches)

Expand Down Expand Up @@ -94,7 +117,16 @@ def disable_random_transforms(self, data_augmentation:bool=True, channels_postpr
def enable_random_transforms(self, parameters):
    """Forward one entry of ``parameters`` to each sub-iterator.

    Sub-iterators and parameters are paired in order; pairing stops at the
    shorter of the two sequences, so surplus entries on either side are
    ignored.
    """
    paired = zip(self.iterators, parameters)
    for sub_iterator, sub_parameters in paired:
        sub_iterator.enable_random_transforms(sub_parameters)


def set_index_probability(self, value):
    """Split a concatenated probability vector across the sub-iterators.

    ``value`` is expected to be the concatenation, in the order of
    ``self.iterators``, of one weight per sample of each sub-iterator (sizes
    given by ``get_sample_number()``). Each slice is normalized to sum to 1
    and assigned to the sub-iterator's ``index_probability``.

    Args:
        value: 1D array-like of weights; converted with ``np.asarray`` so
            plain lists are accepted as well as arrays.

    Raises:
        AssertionError: if the length of ``value`` differs from the total
            number of samples of the sub-iterators. The check is done before
            any sub-iterator is modified, so a failure leaves them untouched.
    """
    value = np.asarray(value)
    sizes = [it.get_sample_number() for it in self.iterators]
    expected = sum(sizes)
    assert expected == value.shape[0], f"invalid index_probability length expected: {expected} actual {value.shape[0]}"
    cur_idx = 0
    for it, size in zip(self.iterators, sizes):
        proba = value[cur_idx:cur_idx + size]
        it.index_probability = proba / np.sum(proba)
        cur_idx += size

def concat_numpy_arrays(arrays):
if isinstance(arrays[0], (list, tuple)):
n = len(arrays[0])
Expand Down

0 comments on commit 10f990f

Please sign in to comment.