/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H

/*
 * NOTE(review): the header names of the first six includes below are not
 * visible in this copy of the file; confirm them against the upstream source.
 */
#include
#include
#include
#include
#include
#include
#include "volumes.h"

struct page;
struct btrfs_fs_info;

/*
 * Kind of operation a btrfs_raid_bio carries out, stored in
 * btrfs_raid_bio::operation.
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
};

/*
 * Overview of btrfs_raid_bio.
 *
 * One btrfs_raid_bio represents a full stripe of RAID56, including both data
 * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K).
 *
 * One btrfs_raid_bio can have one or more bios from the higher layer, covering
 * part or all of the data stripes.
 *
 * [PAGES FROM HIGHER LAYER BIOS]
 * Higher layer bios are in the btrfs_raid_bio::bio_list.
 *
 * Pages from the bio_list are represented like the following:
 *
 * bio_list:	|<- Bio 1 ->|             |<- Bio 2 ->|  ...
 * bio_paddrs:	[0]   [1]   [2]    [3]    [4]    [5]     ...
 *
 * If there is a bio covering a sector (one btrfs fs block), the corresponding
 * entry in btrfs_raid_bio::bio_paddrs[] will hold the physical address
 * (including the offset inside the page) of that sector in the bio.
 *
 * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will
 * be INVALID_PADDR.
 *
 * The length of each entry in bio_paddrs[] is a step (aka, min(sectorsize, PAGE_SIZE)).
 *
 * [PAGES FOR INTERNAL USAGES]
 * Pages not covered by any bio or belonging to P/Q stripes are stored in
 * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
 *
 * stripe_pages:	|<- Page 0 ->|<- Page 1 ->|  ...
 * stripe_paddrs:	[0]    [1]   [2]    [3]   [4] ...
 *
 * The stripe_pages[] array stores all the pages covering the full stripe,
 * including data and P/Q pages.
 * stripe_pages[0] is the first page of the first data stripe.
 * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
 * data stripe.
 *
 * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write
 * (the bio covers all data stripes) there is no need to allocate pages for
 * data stripes (they can be grabbed from bio_paddrs[]).
 *
 * If the corresponding page of stripe_paddrs[i] is not allocated, the value of
 * stripe_paddrs[i] will be INVALID_PADDR.
 *
 * The length of each entry in stripe_paddrs[] is a step.
 *
 * [LOCATING A SECTOR]
 * To locate a sector for IO, we need the following info:
 *
 * - stripe_nr
 *   Starts from 0 (representing the first data stripe), ends at
 *   @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
 *
 * - sector_nr
 *   Starts from 0 (representing the first sector of the stripe), ends
 *   at BTRFS_STRIPE_LEN / sectorsize - 1.
 *
 * - step_nr
 *   A step is min(sector_size, PAGE_SIZE).
 *
 *   Starts from 0 (representing the first step of the sector), ends
 *   at @sector_nsteps - 1.
 *
 *   Most call sites do not need to care about this parameter.
 *   It exists for bs > ps support and only matters for vertical stripe
 *   related work (e.g. RMW/recover).
 *
 * - from which array
 *   Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the
 *   bio_paddrs[] (aka, from the higher layer bios).
 *
 * For IO, a physical address is returned, so that we can extract the page and
 * the offset inside the page for IO.
 * The special value INVALID_PADDR means the physical address is invalid,
 * normally meaning there is no page allocated for the specified sector.
 */
struct btrfs_raid_bio {
	struct btrfs_io_context *bioc;

	/*
	 * While we're doing RMW on a stripe we put it into a hash table so we
	 * can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* For scheduling work in the helper threads */
	struct work_struct work;

	/*
	 * bio_list and bio_list_lock are used to add more bios into the stripe
	 * in hopes of avoiding the full RMW
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * Also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged.  The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO.
	 */
	struct list_head plug_list;

	/* Flags that tell us if it is safe to merge with this bio. */
	unsigned long flags;

	/*
	 * The operation this rbio performs (one of enum btrfs_rbio_ops).
	 *
	 * Set if we're doing a parity rebuild for a read from higher up, which
	 * is handled differently from a parity rebuild as part of RMW.
	 */
	enum btrfs_rbio_ops operation;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no P/Q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/*
	 * How many steps there are for one sector.
	 *
	 * For bs > ps cases, it's sectorsize / PAGE_SIZE.
	 * For bs <= ps cases, it's always 1.
	 */
	u8 sector_nsteps;

	/* Stripe number that we're scrubbing */
	u8 scrubp;

	/*
	 * Size of all the bios in the bio_list.  This helps us decide if the
	 * rbio maps to a full stripe or not.
	 */
	int bio_list_bytes;

	refcount_t refs;

	atomic_t stripes_pending;

	wait_queue_head_t io_wait;

	/* Bitmap to record which horizontal stripe has data */
	unsigned long dbitmap;

	/*
	 * Bitmap for the finish_*() calls, using stripe_nsectors-many bits.
	 * It is embedded in the rbio as a single unsigned long.
	 */
	unsigned long finish_pbitmap;

	/*
	 * These are two arrays of pointers.  We allocate the rbio big enough
	 * to hold them both and setup their locations when the rbio is
	 * allocated.
	 *
	 * NOTE(review): more than two array pointers follow; confirm which of
	 * them this remark applies to.
	 */

	/*
	 * Pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q).
	 */
	struct page **stripe_pages;

	/*
	 * Physical addresses of the sectors in the bio_list, for faster lookup.
	 * One entry per step, INVALID_PADDR if no bio covers it.
	 */
	phys_addr_t *bio_paddrs;

	/*
	 * Physical addresses of the sectors in the stripe_pages[].
	 * One entry per step, INVALID_PADDR if the page is not allocated.
	 */
	phys_addr_t *stripe_paddrs;

	/*
	 * Each set bit means the corresponding sector in stripe_paddrs[] is
	 * uptodate.
	 */
	unsigned long *stripe_uptodate_bitmap;

	/* Allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/*
	 * The bitmap recording where IO errors happened.
	 * Each bit corresponds to one sector in either the bio_paddrs[] or
	 * the stripe_paddrs[] array.
	 */
	unsigned long *error_bitmap;

	/*
	 * Checksum buffer if the rbio is for data.  The buffer should cover
	 * all data sectors (excluding P/Q sectors).
	 */
	u8 *csum_buf;

	/*
	 * Each bit represents if the corresponding sector has data csum found.
	 * Should only cover data sectors (excluding P/Q sectors).
	 */
	unsigned long *csum_bitmap;
};

/*
 * For trace event usage only. Records useful debug info for each bio submitted
 * by RAID56 to each physical device.
 *
 * No matter signed or not, (-1) is always the one indicating we can not grab
 * the proper stripe number.
 */
struct raid56_bio_trace_info {
	u64 devid;

	/* The offset inside the stripe. (<= STRIPE_LEN) */
	u32 offset;

	/*
	 * Stripe number.
	 * 0 is the first data stripe, and nr_data for P stripe,
	 * nr_data + 1 for Q stripe.
	 * A value >= real_stripes (e.g. (u8)-1, as described for this struct)
	 * means the proper stripe number could not be determined.
	 */
	u8 stripe_nr;
};

/*
 * Number of data stripes of a chunk map: the total number of stripes minus
 * the number of parity stripes for the map's type.
 */
static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
	return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}

/*
 * Same calculation as nr_data_stripes(), but taking the stripe count and the
 * type from a btrfs_io_context.
 */
static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
{
	return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
}

/*
 * Special u64 values marking the P and Q stripes.  is_parity_stripe() tests
 * for either of them; being a macro it may evaluate its argument twice, so
 * do not pass an expression with side effects.
 */
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\
			     ((x) == RAID6_Q_STRIPE))

struct btrfs_device;

/* RAID56 entry points; only the prototypes live in this header. */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num);
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
				     struct folio **data_folios, u64 data_logical);

int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);

#endif