diff options
author | Timofey Titovets <nefelim4ag@gmail.com> | 2017-09-28 17:33:37 +0300 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2017-11-01 20:45:36 +0100 |
commit | 17b5a6c17e265ca84fac9c3256ff86c691f04aab (patch) | |
tree | 2eb9c8dab73fa0f881a94c67ff7030adc4ec6630 /fs | |
parent | 4e439a0b184f014a5833fa468af36cc9f59b8fb1 (diff) |
Btrfs: heuristic: add bucket and sample counters and other defines
Add basic defines and structures for data sampling.
Added macros:
- For future sampling algo
- For bucket size
Heuristic workspace:
- Add bucket for storing byte type counters
- Add sample array for storing partial copy of input data range
- Add counter for store current sample size to workspace
Signed-off-by: Timofey Titovets <nefelim4ag@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes, comments updated ]
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/btrfs/compression.c | 53 |
1 files changed, 52 insertions, 1 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b155542c5e6b..4f5d958c71ad 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -707,8 +707,47 @@ out: return ret; } +/* + * Heuristic uses systematic sampling to collect data from the input data + * range, the logic can be tuned by the following constants: + * + * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample + * @SAMPLING_INTERVAL - range from which the sampled data can be collected + */ +#define SAMPLING_READ_SIZE (16) +#define SAMPLING_INTERVAL (256) + +/* + * For statistical analysis of the input data we consider bytes that form a + * Galois Field of 256 objects. Each object has an attribute count, ie. how + * many times the object appeared in the sample. + */ +#define BUCKET_SIZE (256) + +/* + * The size of the sample is based on a statistical sampling rule of thumb. + * The common way is to perform sampling tests as long as the number of + * elements in each cell is at least 5. + * + * Instead of 5, we choose 32 to obtain more accurate results. + * If the data contain the maximum number of symbols, which is 256, we obtain a + * sample size bound by 8192. + * + * For a sample of at most 8KB of data per data range: 16 consecutive bytes + * from up to 512 locations. + */ +#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ + SAMPLING_READ_SIZE / SAMPLING_INTERVAL) + +struct bucket_item { + u32 count; +}; struct heuristic_ws { + /* Partial copy of input data */ + u8 *sample; + /* Buckets store counters for each byte value */ + struct bucket_item *bucket; struct list_head list; }; @@ -718,6 +757,8 @@ static void free_heuristic_ws(struct list_head *ws) workspace = list_entry(ws, struct heuristic_ws, list); + kvfree(workspace->sample); + kfree(workspace->bucket); kfree(workspace); } @@ -729,9 +770,19 @@ static struct list_head *alloc_heuristic_ws(void) if (!ws) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&ws->list); + ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); + if (!ws->sample) + goto fail; + + ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); + if (!ws->bucket) + goto fail; + INIT_LIST_HEAD(&ws->list); return &ws->list; +fail: + free_heuristic_ws(&ws->list); + return ERR_PTR(-ENOMEM); } struct workspaces_list { |