Target-matched sampling:
A general version of this problem is: "I have a source column that I want to sample so that its distribution matches that of a target column", where both columns are discrete (i.e. they hold categories, not floating-point numbers).
The solution mentioned above requires grouping and sampling from each group.
Here's an alternative, which directly outputs the sampled indexes, and also allows specifying the total size of the resulting sample:
def values_dist(vals: Union[List, Tuple, np.ndarray, pd.Series]) -> pd.Series:
    """
    Compute the empirical probability of every distinct value in `vals`.

    :param vals: a list, tuple, numpy array or pandas Series of discrete (hashable) values.
    :return: a pandas Series indexed by distinct value, holding that value's share of all items.
    :raises AssertionError: if `vals` is not one of the accepted container types.
    """
    assert isinstance(vals, (list, tuple, np.ndarray, pd.Series))
    occurrences = Counter(vals)  ## Counter keeps nan and None as keys as well.
    frequencies: pd.Series = pd.Series(occurrences)
    total = frequencies.sum()
    return frequencies / total
def sample_idxs_match_distribution(
        source: Union[List, Tuple, np.ndarray, pd.Series],
        target: Union[List, Tuple, np.ndarray, pd.Series],
        n: Optional[int] = None,
        seed: Optional[int] = None,
        shuffle: bool = True,
        target_is_dist: bool = False,
) -> np.ndarray:
    """
    Pick positions in `source` such that the selected values follow the target distribution.

    The sample size is capped twice: by `n`, and by the largest size for which no category
    needs more items than `source` actually contains. Per-category counts are rounded to the
    nearest integer, so the total can differ slightly from that cap.

    :param source: list, tuple, numpy array or pandas Series of discrete values to sample from.
    :param target: either raw values (their empirical distribution is computed with `values_dist`),
        or, when `target_is_dist=True`, a pandas Series mapping value -> probability.
    :param n: upper bound on the sample size; when None it is resolved via
        `get_default(n, len(source))`.
    :param seed: seed forwarded to `random_sample` and used for the final shuffle.
    :param shuffle: if True, return the selected positions in a random (seeded) order.
    :param target_is_dist: set to True when `target` already is a probability distribution.
    :return: numpy array of integer positions into `source` (0-based, as produced by `enumerate`).
    :raises ValueError: if a value with non-zero target probability never occurs in `source`.
    :raises AssertionError: if the inputs have unsupported types, or the target probabilities
        do not sum to 1 (within a tolerance of 0.01).
    """
    if target_is_dist:
        target_prob_dist: pd.Series = target
    else:
        target_prob_dist: pd.Series = values_dist(target)
    assert isinstance(target_prob_dist, pd.Series)
    assert abs(float(target_prob_dist.sum()) - 1.0) <= 1e-2  ## Sum of probs should be exactly or very close to 1.
    assert isinstance(source, (list, tuple, np.ndarray, pd.Series))

    source_vc: pd.Series = pd.Series(Counter(source))
    ## Only values that must actually be drawn need to exist in the source: a zero-probability
    ## category contributes no samples, so its absence is not an error.
    required_vals: Set = set(target_prob_dist[target_prob_dist > 0].index)
    missing_source_vals: Set = required_vals - set(source_vc.index)
    if len(missing_source_vals) > 0:
        raise ValueError(f'Cannot sample; the following values are missing in the source: {missing_source_vals}')

    n = get_default(n, len(source))
    ## For each value, (available count / target probability) is the largest total sample size that
    ## value can support. Values missing on either side become NaN after index alignment and are dropped.
    per_value_cap: pd.Series = (source_vc / target_prob_dist).apply(
        lambda max_n_sample_category: min(max_n_sample_category, n),
    ).dropna()
    max_n_sample: int = math.floor(min(per_value_cap))

    ## Number of items to draw for each target value:
    source_value_wise_count_to_sample: Dict[Any, int] = (
        (target_prob_dist * max_n_sample).round(0).astype(int).to_dict()
    )

    ## Collect the positions at which each target value occurs in the source:
    source_val_idxs: Dict[Any, List[int]] = {val: [] for val in source_value_wise_count_to_sample}
    for idx, val in enumerate(source):
        if val in source_val_idxs:
            source_val_idxs[val].append(idx)

    ## Select random indexes; values that need no samples are skipped entirely.
    ## NOTE(review): the same seed is handed to `random_sample` for every value; whether that
    ## correlates the per-value draws depends on `random_sample` - confirm.
    sampled_idxs: np.ndarray = np.array(
        flatten1d([
            random_sample(source_val_idxs[val], n=req_source_val_count, seed=seed)
            for val, req_source_val_count in source_value_wise_count_to_sample.items()
            if req_source_val_count > 0
        ]),
        dtype=int,  ## Keeps an empty result usable as an integer index array.
    )
    if shuffle:
        sampled_idxs = np.random.RandomState(seed).permutation(sampled_idxs)
    return sampled_idxs
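Usage: taking the largest sample size possible:
If you do not pass n, the sample is as large as the source allows while still matching the target distribution. For example: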
## Target distribution: probability of each bias category.
bias_dist = pd.Series(
    data=[0.277220, 0.250000, 0.250000, 0.141244, 0.081536],
    index=["least", "left", "right", "left-center", "right-center"],
)

## Source data whose category mix differs from the target.
category_sizes = {
    'least': 500,
    'left': 300,
    'right': 100,
    'left-center': 200,
    'right-center': 1000,
}
source = pd.Series(flatten1d([[label] * size for label, size in category_sizes.items()]))

## No `n` is given here.
idxs = sample_idxs_match_distribution(source, target=bias_dist, target_is_dist=True)
matched_source = source.iloc[idxs]

## Show the absolute and the relative category counts of the matched sample.
absolute_counts = matched_source.value_counts(normalize=False)
relative_counts = matched_source.value_counts(normalize=True)
print(absolute_counts)
print()
print(relative_counts)
Output:
least 111
left 100
right 100
left-center 56
right-center 33
dtype: int64
least 0.2775
left 0.2500
right 0.2500
left-center 0.1400
right-center 0.0825
dtype: float64
Usage: Restricting to a certain sample-size:
If you additionally pass n, you can restrict to a certain sample-size:
## Same call as before, but with the sample size restricted through n=100.
sampling_options = dict(target=bias_dist, target_is_dist=True, n=100)
idxs = sample_idxs_match_distribution(source, **sampling_options)
matched_source = source.iloc[idxs]

## Show the absolute and the relative category counts of the restricted sample.
absolute_counts = matched_source.value_counts(normalize=False)
relative_counts = matched_source.value_counts(normalize=True)
print(absolute_counts)
print()
print(relative_counts)
Output:
least 28
right 25
left 25
left-center 14
right-center 8
dtype: int64
least 0.28
right 0.25
left 0.25
left-center 0.14
right-center 0.08
dtype: float64