aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/raid56.h
blob: 1f463ecf7e41761b5785e9e0bccbf9aa426c49d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/bio.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "volumes.h"

struct page;
struct btrfs_fs_info;

/*
 * Kind of operation a btrfs_raid_bio carries out, stored in
 * btrfs_raid_bio::operation.  The numeric values are spelled out so they
 * stay stable if an entry is ever inserted.
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE		= 0,
	BTRFS_RBIO_READ_REBUILD		= 1,
	BTRFS_RBIO_PARITY_SCRUB		= 2,
};

/*
 * Overview of btrfs_raid_bio.
 *
 * One btrfs_raid_bio represents a full stripe of RAID56, including both data
 * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K).
 *
 * One btrfs_raid_bio can have one or more bios from higher layer, covering
 * part or all of the data stripes.
 *
 * [PAGES FROM HIGHER LAYER BIOS]
 * Higher layer bios are in the btrfs_raid_bio::bio_list.
 *
 * Pages from the bio_list are represented like the following:
 *
 * bio_list:	     |<- Bio 1 ->|             |<- Bio 2 ->|  ...
 * bio_paddrs:	    [0]   [1]   [2]    [3]    [4]    [5]      ...
 *
 * If there is a bio covering a sector (one btrfs fs block), the corresponding
 * pointer in btrfs_raid_bio::bio_paddrs[] will point to the physical address
 * (with the offset inside the page) of the corresponding bio.
 *
 * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will
 * be INVALID_PADDR.
 *
 * The length of each entry in bio_paddrs[] is a step (aka, min(sectorsize, PAGE_SIZE)).
 *
 * [PAGES FOR INTERNAL USAGES]
 * Pages not covered by any bio or belonging to P/Q stripes are stored in
 * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
 *
 * stripe_pages:       |<- Page 0 ->|<- Page 1 ->|  ...
 * stripe_paddrs:     [0]    [1]   [2]    [3]   [4] ...
 *
 * stripe_pages[] array stores all the pages covering the full stripe, including
 * data and P/Q pages.
 * stripe_pages[0] is the first page of the first data stripe.
 * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
 * data stripe.
 *
 * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write
 * (the bio covers all data stripes) there is no need to allocate pages for
 * data stripes (can grab from bio_paddrs[]).
 *
 * If the corresponding page of stripe_paddrs[i] is not allocated, the value of
 * stripe_paddrs[i] will be INVALID_PADDR.
 *
 * The length of each entry in stripe_paddrs[] is a step.
 *
 * [LOCATING A SECTOR]
 * To locate a sector for IO, we need the following info:
 *
 * - stripe_nr
 *   Starts from 0 (representing the first data stripe), ends at
 *   @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
 *
 * - sector_nr
 *   Starts from 0 (representing the first sector of the stripe), ends
 *   at BTRFS_STRIPE_LEN / sectorsize - 1.
 *
 * - step_nr
 *   A step is min(sectorsize, PAGE_SIZE).
 *
 *   Starts from 0 (representing the first step of the sector), ends
 *   at @sector_nsteps - 1.
 *
 *   Most call sites do not need to care about this parameter.
 *   It exists for bs > ps (sectorsize > PAGE_SIZE) support and only matters
 *   for vertical stripe related work (e.g. RMW/recover).
 *
 * - from which array
 *   Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the
 *   bio_paddrs[] (aka, from the higher layer bios).
 *
 * For IO, a physical address is returned, so that we can extract the page and
 * the offset inside the page for IO.
 * A special value INVALID_PADDR represents when the physical address is invalid,
 * normally meaning there is no page allocated for the specified sector.
 */
struct btrfs_raid_bio {
	/* The io context this rbio operates on. */
	struct btrfs_io_context *bioc;

	/*
	 * While we're doing RMW on a stripe we put it into a hash table so we
	 * can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* For scheduling work in the helper threads */
	struct work_struct work;

	/*
	 * bio_list and bio_list_lock are used to add more bios into the stripe
	 * in hopes of avoiding the full RMW
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * Also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged.  The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO.
	 */
	struct list_head plug_list;

	/* Flags that tell us if it is safe to merge with this bio. */
	unsigned long flags;

	/*
	 * The operation this rbio is carrying out, one of enum btrfs_rbio_ops.
	 *
	 * E.g. a parity rebuild for a read from higher up is handled
	 * differently from a parity rebuild as part of RMW.
	 */
	enum btrfs_rbio_ops operation;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no p/q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/*
	 * How many steps there are for one sector.
	 *
	 * For bs > ps cases, it's sectorsize / PAGE_SIZE.
	 * For bs <= ps cases, it's always 1.
	 */
	u8 sector_nsteps;

	/* Stripe number that we're scrubbing */
	u8 scrubp;

	/*
	 * Size of all the bios in the bio_list.  This helps us decide if the
	 * rbio maps to a full stripe or not.
	 */
	int bio_list_bytes;

	/* Reference count of this rbio. */
	refcount_t refs;

	/*
	 * NOTE(review): presumably the number of per-stripe IOs still in
	 * flight, with io_wait being the queue to wait on for them -- confirm
	 * against the users, which are not visible in this header.
	 */
	atomic_t stripes_pending;

	wait_queue_head_t io_wait;

	/* Bitmap to record which horizontal stripe has data */
	unsigned long dbitmap;

	/* Bitmap with stripe_nsectors-many bits in use, for finish_*() calls */
	unsigned long finish_pbitmap;

	/*
	 * The members below are pointers to per-rbio arrays.
	 *
	 * NOTE(review): this comment used to describe "two arrays of pointers"
	 * living in the same allocation as the rbio, with their locations set
	 * up when the rbio is allocated.  More than two arrays follow now, so
	 * confirm the allocation scheme against the rbio allocator.
	 */

	/*
	 * Pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q).
	 */
	struct page **stripe_pages;

	/*
	 * Physical addresses of the sectors covered by the bio_list, for
	 * faster lookup.  An entry is INVALID_PADDR if no bio covers it.
	 */
	phys_addr_t *bio_paddrs;

	/*
	 * Physical addresses of the sectors inside stripe_pages[].  An entry is
	 * INVALID_PADDR if the corresponding page is not allocated.
	 */
	phys_addr_t *stripe_paddrs;

	/* Each set bit means the corresponding sector in stripe_paddrs[] is uptodate. */
	unsigned long *stripe_uptodate_bitmap;

	/* Allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/*
	 * The bitmap recording where IO errors happened.
	 * Each bit corresponds to one sector, no matter whether that sector
	 * is tracked in the bio_paddrs[] or the stripe_paddrs[] array.
	 */
	unsigned long *error_bitmap;

	/*
	 * Checksum buffer if the rbio is for data.  The buffer should cover
	 * all data sectors (excluding P/Q sectors).
	 */
	u8 *csum_buf;

	/*
	 * Each bit represents if the corresponding sector has data csum found.
	 * Should only cover data sectors (excluding P/Q sectors).
	 */
	unsigned long *csum_bitmap;
};

/*
 * For trace event usage only. Records useful debug info for each bio submitted
 * by RAID56 to each physical device.
 *
 * Regardless of the signedness of a member, (-1) is always the value
 * indicating that we could not grab the proper stripe number.
 */
struct raid56_bio_trace_info {
	/* Id of the device the bio is submitted to. */
	u64 devid;

	/* The offset inside the stripe. (<= STRIPE_LEN) */
	u32 offset;

	/*
	 * Stripe number.
	 * 0 is the first data stripe, and nr_data for P stripe,
	 * nr_data + 1 for Q stripe.
	 *
	 * NOTE(review): the original comment ended with an unfinished
	 * ">= real_stripes for"; presumably any value >= real_stripes
	 * (including the (-1) mentioned above) means no valid stripe number
	 * could be determined -- confirm against the trace event code.
	 */
	u8 stripe_nr;
};

/*
 * Number of data stripes described by @map: all of its stripes minus the
 * parity stripes that btrfs_nr_parity_stripes() reports for the map's type.
 */
static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
	const int nr_parity = btrfs_nr_parity_stripes(map->type);

	return map->num_stripes - nr_parity;
}

/*
 * Number of data stripes described by @bioc: all of its stripes minus the
 * parity stripes that btrfs_nr_parity_stripes() reports for its map type.
 */
static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
{
	const int nr_parity = btrfs_nr_parity_stripes(bioc->map_type);

	return bioc->num_stripes - nr_parity;
}

/*
 * Sentinel stripe values for the P and Q stripes (as the names indicate).
 * They are the two largest u64 values.
 */
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

/*
 * Evaluate to non-zero if @x equals RAID5_P_STRIPE or RAID6_Q_STRIPE, and to
 * zero otherwise.
 *
 * @x is converted to u64 and evaluated exactly once, so passing an expression
 * with side effects (e.g. "i++") is safe.  A statement expression is used for
 * that, thus the macro can only be used inside a function body.
 */
#define is_parity_stripe(x)						\
({									\
	const u64 __ps_val = (x);					\
									\
	(__ps_val == RAID5_P_STRIPE) || (__ps_val == RAID6_Q_STRIPE);	\
})

/* Only used through a pointer in the prototypes below. */
struct btrfs_device;

/*
 * Entry points of the RAID56 code.  The definitions are not part of this
 * header, so the notes below are derived from the names and signatures only.
 */

/*
 * NOTE(review): presumably starts a rebuild (BTRFS_RBIO_READ_REBUILD) for
 * @bio, with @mirror_num selecting what to recover -- confirm against the
 * definition.
 */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num);
/* NOTE(review): presumably the write path counterpart (BTRFS_RBIO_WRITE). */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);

/*
 * Parity scrub is split into two calls: the first one returns a
 * btrfs_raid_bio, the second one takes a btrfs_raid_bio and submits it.
 *
 * NOTE(review): @dbitmap and @stripe_nsectors look like the initial values of
 * btrfs_raid_bio::dbitmap and ::stripe_nsectors; whether the allocation can
 * return NULL or an ERR_PTR() must be checked at the definition.
 */
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

/*
 * NOTE(review): presumably copies/caches the content of @data_folios, which
 * start at logical address @data_logical, into @rbio -- confirm.
 */
void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
				     struct folio **data_folios, u64 data_logical);

/*
 * Set up and tear down the stripe hash table of @info.
 *
 * NOTE(review): the int return value presumably follows the usual
 * 0 / negative errno convention -- confirm.
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);

#endif