Skip to main content
deleted 2 characters in body
Source Link
use deflate::deflate_bytes;

mod pcg32;
use pcg32::Pcg32;
mod seed;
use seed::generate_seed;
mod shannon_entropy;
use shannon_entropy::get_shannon_entropy_from_distribution;

// Use a Pcg32 to generate a random value between 0 and `max_value` inclusive.
// Multiple calls generate a uniform distribution in this range while avoiding
// modulo bias.
/// Draw a uniformly distributed value in `0..=max_value` from `rng`.
///
/// Uses OpenBSD's "Debiased Modulo (Twice)" rejection scheme, described
/// in <https://www.pcg-random.org/posts/bounded-rands.html>, so repeated
/// calls produce a uniform distribution with no modulo bias.
fn sample_uniform(rng: &mut Pcg32, max_value: u32) -> u32 {
    // The whole 32-bit range is already uniform; nothing to reject.
    if max_value == u32::MAX {
        return rng.sample();
    }

    let range = max_value + 1;
    // Smallest raw draw whose residue class appears a full number of
    // times in 0..=u32::MAX; anything below it must be rejected.
    let threshold = range.wrapping_neg() % range;
    let mut draw = rng.sample();
    while draw < threshold {
        draw = rng.sample();
    }
    draw % range
}

/// Metrics gathered from one RNG evaluation run.
#[derive(Copy, Clone, Debug)]
struct RngEvaluationResult {
    /// Shannon entropy (bits per symbol) of the observed value distribution.
    shannon_entropy: f64,
    /// Compressed size divided by uncompressed size of the packed sequence.
    compression_ratio: f64,
}

/// Generate `sequence_len` values in `0..=100` from `rng` and evaluate
/// two randomness indicators: the Shannon entropy of the observed value
/// distribution and the DEFLATE compression ratio of the packed sequence.
fn evaluate_rng(rng: &mut Pcg32, sequence_len: usize) -> RngEvaluationResult {
    // We store the generated values in a packed sequence
    // in order to measure the compression ratio of the generated sequence
    // itself without introducing extra bits at 0 due to the storage.
    // The equation 101**n = 256**(n-1)
    // solves to n = ceil(1 / (1 - log2(101) / 8)) = 6
    // which means that we can store 6 values each between 0-100 inclusive
    // inside 5 bytes.
    //
    // Every group of 6 values -- including a trailing partial group --
    // emits exactly 5 bytes, so the final length is
    // 5 * ceil(sequence_len / 6). The previous capacity expression
    // `(5 * sequence_len).div_ceil(6)` under-allocated by up to 4 bytes
    // whenever `sequence_len % 6 != 0`, forcing one reallocation.
    let mut packed_sequence =
        Vec::<u8>::with_capacity(5 * sequence_len.div_ceil(6));
    // We use a 64 bits accumulator
    // to compute sum_{0<=k<=5}(v_k * 101**k) < 101**6, which fits inside
    // 64 bits (101**6 < 2**40), before storing its lower 40 bits into
    // 5 bytes of the packed sequence.
    let mut accumulator: u64 = 0;
    // Base-101 positional weights.
    // We do not use Horner method in order to keep the order of the sequence.
    let muls = [
        1,
        101,
        101 * 101,
        101 * 101 * 101,
        101 * 101 * 101 * 101,
        101 * 101 * 101 * 101 * 101,
    ];
    // Index of the next free base-101 "digit" slot (0..6).
    let mut k: usize = 0;

    // Store the accumulator's lower 40 bits as 5 little-endian bytes.
    fn store_acc(acc: u64, dst: &mut Vec<u8>) {
        for i in 0..5 {
            dst.push((acc >> (8 * i)) as u8);
        }
    }

    // Histogram of generated values, used for the entropy estimate.
    let mut distribution = vec![0_usize; 101];

    for _ in 0..sequence_len {
        let val = sample_uniform(rng, 100);
        distribution[val as usize] += 1;

        accumulator += val as u64 * muls[k];
        k += 1;
        if k == 6 {
            store_acc(accumulator, &mut packed_sequence);
            accumulator = 0;
            k = 0;
        }
    }
    // Flush a trailing partial group, if any.
    if k > 0 {
        store_acc(accumulator, &mut packed_sequence);
    }

    RngEvaluationResult {
        shannon_entropy: get_shannon_entropy_from_distribution(&distribution),
        // NB: NaN when `sequence_len == 0` (0.0 / 0.0).
        compression_ratio: deflate_bytes(&packed_sequence).len() as f64
            / packed_sequence.len() as f64,
    }
}

/// Entry point: seed a PCG32 generator, evaluate it over 100 000 draws,
/// and print the resulting metrics.
fn main() {
    let mut rng = Pcg32::new(generate_seed());

    let result = evaluate_rng(&mut rng, 100_000);

    println!("Shannon entropy:   {:.6}", result.shannon_entropy);
    println!("Compression ratio: {:.6}", result.compression_ratio);
}
use deflate::deflate_bytes;

mod pcg32;
use pcg32::Pcg32;
mod seed;
use seed::generate_seed;
mod shannon_entropy;
use shannon_entropy::get_shannon_entropy_from_distribution;

// Use a Pcg32 to generate a random value between 0 and `max_value` inclusive.
// Multiple calls generate a uniform distribution in this range while avoiding
// modulo bias.
/// Draw a uniformly distributed value in `0..=max_value` from `rng`.
///
/// Uses OpenBSD's "Debiased Modulo (Twice)" rejection scheme, described
/// in <https://www.pcg-random.org/posts/bounded-rands.html>, so repeated
/// calls produce a uniform distribution with no modulo bias.
fn sample_uniform(rng: &mut Pcg32, max_value: u32) -> u32 {
    // The whole 32-bit range is already uniform; nothing to reject.
    if max_value == u32::MAX {
        return rng.sample();
    }

    let range = max_value + 1;
    // Smallest raw draw whose residue class appears a full number of
    // times in 0..=u32::MAX; anything below it must be rejected.
    let threshold = range.wrapping_neg() % range;
    let mut draw = rng.sample();
    while draw < threshold {
        draw = rng.sample();
    }
    draw % range
}

/// Metrics gathered from one RNG evaluation run.
#[derive(Copy, Clone, Debug)]
struct RngEvaluationResult {
    /// Shannon entropy (bits per symbol) of the observed value distribution.
    shannon_entropy: f64,
    /// Compressed size divided by uncompressed size of the packed sequence.
    compression_ratio: f64,
}

/// Generate `sequence_len` values in `0..=100` from `rng` and evaluate
/// two randomness indicators: the Shannon entropy of the observed value
/// distribution and the DEFLATE compression ratio of the packed sequence.
fn evaluate_rng(rng: &mut Pcg32, sequence_len: usize) -> RngEvaluationResult {
    // We store the generated values in a packed sequence
    // in order to measure the compression ratio of the generated sequence
    // itself without introducing extra bits at 0 due to the storage.
    // The equation 101**n = 256**(n-1)
    // solves to n = ceil(1 / (1 - log2(101) / 8)) = 6
    // which means that we can store 6 values each between 0-100 inclusive
    // inside 5 bytes.
    //
    // Every group of 6 values -- including a trailing partial group --
    // emits exactly 5 bytes, so the final length is
    // 5 * ceil(sequence_len / 6). The previous capacity expression
    // `(5 * sequence_len).div_ceil(6)` under-allocated by up to 4 bytes
    // whenever `sequence_len % 6 != 0`, forcing one reallocation.
    let mut packed_sequence =
        Vec::<u8>::with_capacity(5 * sequence_len.div_ceil(6));
    // We use a 64 bits accumulator
    // to compute sum_{0<=k<=5}(v_k * 101**k) < 101**6, which fits inside
    // 64 bits (101**6 < 2**40), before storing its lower 40 bits into
    // 5 bytes of the packed sequence.
    let mut accumulator: u64 = 0;
    // Base-101 positional weights.
    // We do not use Horner method in order to keep the order of the sequence.
    let muls = [
        1,
        101,
        101 * 101,
        101 * 101 * 101,
        101 * 101 * 101 * 101,
        101 * 101 * 101 * 101 * 101,
    ];
    // Index of the next free base-101 "digit" slot (0..6).
    let mut k: usize = 0;

    // Store the accumulator's lower 40 bits as 5 little-endian bytes.
    fn store_acc(acc: u64, dst: &mut Vec<u8>) {
        for i in 0..5 {
            dst.push((acc >> (8 * i)) as u8);
        }
    }

    // Histogram of generated values, used for the entropy estimate.
    let mut distribution = vec![0_usize; 101];

    for _ in 0..sequence_len {
        let val = sample_uniform(rng, 100);
        distribution[val as usize] += 1;

        accumulator += val as u64 * muls[k];
        k += 1;
        if k == 6 {
            store_acc(accumulator, &mut packed_sequence);
            accumulator = 0;
            k = 0;
        }
    }
    // Flush a trailing partial group, if any.
    if k > 0 {
        store_acc(accumulator, &mut packed_sequence);
    }

    RngEvaluationResult {
        shannon_entropy: get_shannon_entropy_from_distribution(&distribution),
        // NB: NaN when `sequence_len == 0` (0.0 / 0.0).
        compression_ratio: deflate_bytes(&packed_sequence).len() as f64
            / packed_sequence.len() as f64,
    }
}

/// Entry point: seed a PCG32 generator, evaluate it over 100 000 draws,
/// and print the resulting metrics.
fn main() {
    let mut rng = Pcg32::new(generate_seed());

    let result = evaluate_rng(&mut rng, 100_000);

    println!("Shannon entropy:   {:.6}", result.shannon_entropy);
    println!("Compression ratio: {:.6}", result.compression_ratio);
}
use deflate::deflate_bytes;

mod pcg32;
use pcg32::Pcg32;
mod seed;
use seed::generate_seed;
mod shannon_entropy;
use shannon_entropy::get_shannon_entropy_from_distribution;

// Use a Pcg32 to generate a random value between 0 and `max_value` inclusive.
// Multiple calls generate a uniform distribution in this range while avoiding
// modulo bias.
/// Draw a uniformly distributed value in `0..=max_value` from `rng`.
///
/// Uses OpenBSD's "Debiased Modulo (Twice)" rejection scheme, described
/// in <https://www.pcg-random.org/posts/bounded-rands.html>, so repeated
/// calls produce a uniform distribution with no modulo bias.
fn sample_uniform(rng: &mut Pcg32, max_value: u32) -> u32 {
    // The whole 32-bit range is already uniform; nothing to reject.
    if max_value == u32::MAX {
        return rng.sample();
    }

    let range = max_value + 1;
    // Smallest raw draw whose residue class appears a full number of
    // times in 0..=u32::MAX; anything below it must be rejected.
    let threshold = range.wrapping_neg() % range;
    let mut draw = rng.sample();
    while draw < threshold {
        draw = rng.sample();
    }
    draw % range
}

/// Metrics gathered from one RNG evaluation run.
#[derive(Copy, Clone, Debug)]
struct RngEvaluationResult {
    /// Shannon entropy (bits per symbol) of the observed value distribution.
    shannon_entropy: f64,
    /// Compressed size divided by uncompressed size of the packed sequence.
    compression_ratio: f64,
}

/// Draw `sequence_len` values in `0..=100` from `rng` and report the
/// Shannon entropy of their distribution together with the DEFLATE
/// compression ratio of the densely packed byte sequence.
fn evaluate_rng(rng: &mut Pcg32, sequence_len: usize) -> RngEvaluationResult {
    // The values are packed before compression so the storage itself
    // adds no predictable zero bits to the measured sequence: since
    // 101**6 < 256**5 (n = ceil(1 / (1 - log2(101) / 8)) = 6), every
    // run of 6 values in 0..=100 fits into exactly 5 bytes.
    let mut packed = Vec::<u8>::with_capacity(5 * sequence_len.div_ceil(6));

    // Base-101 positional weights. Horner's method is avoided so the
    // packed bytes preserve the original order of the sequence.
    let mut weights = [1_u64; 6];
    for i in 1..6 {
        weights[i] = weights[i - 1] * 101;
    }

    // Histogram of generated values, used for the entropy estimate.
    let mut histogram = vec![0_usize; 101];
    // 64-bit accumulator for sum_{0<=k<6}(v_k * 101**k) < 101**6,
    // which fits in the low 40 bits.
    let mut acc: u64 = 0;
    // Next free base-101 "digit" slot (0..6).
    let mut slot: usize = 0;

    for _ in 0..sequence_len {
        let value = sample_uniform(rng, 100);
        histogram[value as usize] += 1;

        acc += u64::from(value) * weights[slot];
        slot += 1;
        if slot == 6 {
            // Flush the accumulator's low 40 bits (5 little-endian bytes).
            packed.extend_from_slice(&acc.to_le_bytes()[..5]);
            acc = 0;
            slot = 0;
        }
    }
    // A trailing partial group still emits a full 5-byte chunk.
    if slot != 0 {
        packed.extend_from_slice(&acc.to_le_bytes()[..5]);
    }

    RngEvaluationResult {
        shannon_entropy: get_shannon_entropy_from_distribution(&histogram),
        compression_ratio: deflate_bytes(&packed).len() as f64
            / packed.len() as f64,
    }
}

/// Entry point: seed a PCG32 generator, evaluate it over 100 000 draws,
/// and print the resulting metrics.
fn main() {
    let mut rng = Pcg32::new(generate_seed());

    let result = evaluate_rng(&mut rng, 100_000);

    println!("Shannon entropy:   {:.6}", result.shannon_entropy);
    println!("Compression ratio: {:.6}", result.compression_ratio);
}
deleted 1 character in body
Source Link

The more important challenge was to compute the compression ratio on a byte sequence without introducing extra patterns due to the storage itself, which would lead to a lower measurement than on raw consecutive data; see below.

The more important challenge was to compute the compression ratio on a byte sequence that doesn't introduce extra patterns due to the storage itself, which would lead to a lower measurement than on raw consecutive data; see below.

The more important challenge was to compute the compression ratio on a byte sequence without introducing extra pattern due to the storage itself which leads to a lower measurement than on raw consecutive data, see below.

added 644 characters in body
Source Link

The more important challenge was to compute the compression ratio on a byte sequence that doesn't introduce extra patterns due to the storage itself, which would lead to a lower measurement than on raw consecutive data; see below.

NB: In the first iteration I obtained a compression ratio of 0.843 which was not as high as expected, but this was because the values are generated in the inclusive range 0-100, so the high order bit of each byte of the generated sequence is always 0. To check this not so high compression ratio was due to the storage/encoding of the sequence, I generated the values in the inclusive range 0-255 instead for which I obtained a compression ratio slightly higher than 1.0. Then to compute the compression ratio on the generated sequence itself without adding extra bits for storage I stored each sequence of 6 consecutive values between 0 and 100 inclusive into 5 consecutive bytes. Because the equation 101**n = 256**(n-1) solves to n = ceil(1 / (1 - log2(101) / 8)) = 6 so that the number sum_{0<=k<=5}(v_k * 101**k) accumulating 6 consecutive values < 101 can be stored into 5 bytes.

NB: In the first iteration I obtained a compression ratio of 0.843 which was not as high as expected, but this was because the values are generated in the inclusive range 0-100, so the high order bit of each byte of the generated sequence is always 0. To check this not so high compression ratio was due to the storage/encoding of the sequence, I generated the values in the inclusive range 0-255 instead for which I obtained a compression ratio slightly higher than 1.0.

The more important challenge was to compute the compression ratio on a byte sequence that doesn't introduce extra patterns due to the storage itself, which would lead to a lower measurement than on raw consecutive data; see below.

NB: In the first iteration I obtained a compression ratio of 0.843 which was not as high as expected, but this was because the values are generated in the inclusive range 0-100, so the high order bit of each byte of the generated sequence is always 0. To check this not so high compression ratio was due to the storage/encoding of the sequence, I generated the values in the inclusive range 0-255 instead for which I obtained a compression ratio slightly higher than 1.0. Then to compute the compression ratio on the generated sequence itself without adding extra bits for storage I stored each sequence of 6 consecutive values between 0 and 100 inclusive into 5 consecutive bytes. Because the equation 101**n = 256**(n-1) solves to n = ceil(1 / (1 - log2(101) / 8)) = 6 so that the number sum_{0<=k<=5}(v_k * 101**k) accumulating 6 consecutive values < 101 can be stored into 5 bytes.

added 1227 characters in body; added 49 characters in body
Source Link
Loading
deleted 107 characters in body
Source Link
Loading
edited body
Source Link
Loading
added 577 characters in body; added 38 characters in body
Source Link
Loading
added 1 character in body
Source Link
Loading
deleted 1 character in body; added 9 characters in body; added 9 characters in body
Source Link
Loading
added 302 characters in body
Source Link
Loading
added 779 characters in body
Source Link
Loading
Source Link
Loading