diff options
Diffstat (limited to 'include')
31 files changed, 2004 insertions, 388 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0b5b1af35e5e..e850e76acaaf 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,8 @@ struct bdi_writeback { struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + unsigned long dirty_sleep; /* last wait */ + struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/include/linux/bio.h b/include/linux/bio.h index 97cb48f03dc7..7cf8a6c70a3f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -63,6 +63,12 @@ #define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio))) /* + * Return the data direction, READ or WRITE. + */ +#define bio_data_dir(bio) \ + (op_is_write(bio_op(bio)) ? WRITE : READ) + +/* * Check whether this bio carries any data or not. A NULL bio is allowed. */ static inline bool bio_has_data(struct bio *bio) @@ -70,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio) if (bio && bio->bi_iter.bi_size && bio_op(bio) != REQ_OP_DISCARD && - bio_op(bio) != REQ_OP_SECURE_ERASE) + bio_op(bio) != REQ_OP_SECURE_ERASE && + bio_op(bio) != REQ_OP_WRITE_ZEROES) return true; return false; @@ -80,18 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || - bio_op(bio) == REQ_OP_WRITE_SAME; -} - -static inline bool bio_is_rw(struct bio *bio) -{ - if (!bio_has_data(bio)) - return false; - - if (bio_no_advance_iter(bio)) - return false; - - return true; + bio_op(bio) == REQ_OP_WRITE_SAME || + bio_op(bio) == REQ_OP_WRITE_ZEROES; } static inline bool bio_mergeable(struct bio *bio) @@ -193,18 +190,20 @@ static inline unsigned bio_segments(struct bio *bio) struct bvec_iter iter; /* - * We special case discard/write same, because they interpret bi_size - * differently: + * We special case discard/write same/write zeroes, because they + * interpret bi_size differently: */ - if (bio_op(bio) == REQ_OP_DISCARD) - return 1; - - if (bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; + default: + break; + } bio_for_each_segment(bv, bio, iter) segs++; @@ -409,6 +408,8 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } +extern blk_qc_t submit_bio(struct bio *); + extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) @@ -423,13 +424,15 @@ extern int bio_phys_segments(struct request_queue *, struct bio *); extern int submit_bio_wait(struct bio *bio); extern void bio_advance(struct bio *, unsigned); -extern void bio_init(struct bio *); +extern void bio_init(struct bio *bio, struct bio_vec *table, + unsigned short max_vecs); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, const struct iov_iter *, gfp_t); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3bf5d33800ab..01b62e7bac74 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -581,15 +581,14 @@ static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The * caller is responsible for synchronizing calls to this function. */ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int op, int op_flags, uint64_t val) + unsigned int op, uint64_t val) { struct percpu_counter *cnt; @@ -600,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); - if (op_flags & REQ_SYNC) + if (op_is_sync(op)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; @@ -705,9 +704,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf, + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } rcu_read_unlock(); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 535ab2e13d2e..87e404aae267 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -3,6 +3,7 @@ #include <linux/blkdev.h> #include <linux/sbitmap.h> +#include <linux/srcu.h> struct blk_mq_tags; struct blk_flush_queue; @@ -35,6 +36,8 @@ struct blk_mq_hw_ctx { struct blk_mq_tags *tags; + struct srcu_struct queue_rq_srcu; + unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 @@ -215,18 +218,20 @@ void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, int error); void __blk_mq_end_request(struct request *rq, int error); -void blk_mq_requeue_request(struct request *rq); -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); -void blk_mq_cancel_requeue_work(struct request_queue *q); +void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq, int error); +bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); +void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cd395ecec99d..519ea2c9df61 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,7 +17,6 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); -#ifdef CONFIG_BLOCK /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -88,24 +87,6 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) -#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) -#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) - -#define bio_set_op_attrs(bio, op, op_flags) do { \ - if (__builtin_constant_p(op)) \ - BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ - else \ - WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ - if (__builtin_constant_p(op_flags)) \ - BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - else \ - WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - (bio)->bi_opf = bio_flags(bio); \ - (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= (op_flags); \ -} while (0) - #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) /* @@ -119,6 +100,8 @@ struct bio { #define BIO_QUIET 6 /* Make BIO Quiet */ #define BIO_CHAIN 7 /* chained bio, ->bi_remaining in effect */ #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ +#define BIO_THROTTLED 9 /* This bio has already been subjected to + * throttling rules. Don't do it again. */ /* * Flags starting here get preserved by bio_reset() - this includes @@ -142,53 +125,61 @@ struct bio { #define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) -#endif /* CONFIG_BLOCK */ - /* - * Request flags. For use in the cmd_flags field of struct request, and in - * bi_opf of struct bio. Note that some flags are only valid in either one. + * Operations and flags common to the bio and request structures. + * We use 8 bits for encoding the operation, and the remaining 24 for flags. + * + * The least significant bit of the operation number indicates the data + * transfer direction: + * + * - if the least significant bit is set transfers are TO the device + * - if the least significant bit is not set transfers are FROM the device + * + * If a operation does not transfer data the least significant bit has no + * meaning. */ -enum rq_flag_bits { - /* common flags */ - __REQ_FAILFAST_DEV, /* no driver retries of device errors */ +#define REQ_OP_BITS 8 +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) +#define REQ_FLAG_BITS 24 + +enum req_opf { + /* read sectors from the device */ + REQ_OP_READ = 0, + /* write sectors to the device */ + REQ_OP_WRITE = 1, + /* flush the volatile write cache */ + REQ_OP_FLUSH = 2, + /* discard sectors */ + REQ_OP_DISCARD = 3, + /* get zone information */ + REQ_OP_ZONE_REPORT = 4, + /* securely erase sectors */ + REQ_OP_SECURE_ERASE = 5, + /* seset a zone write pointer */ + REQ_OP_ZONE_RESET = 6, + /* write the same sector many times */ + REQ_OP_WRITE_SAME = 7, + /* write the zero filled sector many times */ + REQ_OP_WRITE_ZEROES = 8, + + REQ_OP_LAST, +}; + +enum req_flag_bits { + __REQ_FAILFAST_DEV = /* no driver retries of device errors */ + REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - - __REQ_NOIDLE, /* don't anticipate more IO after this one */ + __REQ_NOMERGE, /* don't touch this for merging */ + __REQ_IDLE, /* anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ - - /* bio only flags */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - __REQ_THROTTLED, /* This bio has already been subjected to - * throttling rules. Don't do it again. */ - - /* request only flags */ - __REQ_SORTED, /* elevator knows about this request */ - __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ - __REQ_NOMERGE, /* don't touch this for merging */ - __REQ_STARTED, /* drive already may have started this one */ - __REQ_DONTPREP, /* don't call prep for this one */ - __REQ_QUEUED, /* uses queueing */ - __REQ_ELVPRIV, /* elevator private data attached */ - __REQ_FAILED, /* set if the request failed */ - __REQ_QUIET, /* don't worry about errors */ - __REQ_PREEMPT, /* set for "ide_preempt" requests and also - for requests for which the SCSI "quiesce" - state must be ignored. */ - __REQ_ALLOCED, /* request came from our alloc pool */ - __REQ_COPY_USER, /* contains copies of user pages */ - __REQ_FLUSH_SEQ, /* request for flush sequence */ - __REQ_IO_STAT, /* account I/O stat */ - __REQ_MIXED_MERGE, /* merge of different types, fail separately */ - __REQ_PM, /* runtime pm request */ - __REQ_HASHED, /* on IO scheduler merge hash */ - __REQ_MQ_INFLIGHT, /* track inflight for MQ */ + __REQ_BACKGROUND, /* background IO */ __REQ_NR_BITS, /* stops here */ }; @@ -198,54 +189,47 @@ enum rq_flag_bits { #define REQ_SYNC (1ULL << __REQ_SYNC) #define REQ_META (1ULL << __REQ_META) #define REQ_PRIO (1ULL << __REQ_PRIO) -#define REQ_NOIDLE (1ULL << __REQ_NOIDLE) +#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) +#define REQ_IDLE (1ULL << __REQ_IDLE) #define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY) +#define REQ_FUA (1ULL << __REQ_FUA) +#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) +#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -#define REQ_COMMON_MASK \ - (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE) -#define REQ_CLONE_MASK REQ_COMMON_MASK -/* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ) + (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) -#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_THROTTLED (1ULL << __REQ_THROTTLED) +#define bio_op(bio) \ + ((bio)->bi_opf & REQ_OP_MASK) +#define req_op(req) \ + ((req)->cmd_flags & REQ_OP_MASK) -#define REQ_SORTED (1ULL << __REQ_SORTED) -#define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER) -#define REQ_FUA (1ULL << __REQ_FUA) -#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_STARTED (1ULL << __REQ_STARTED) -#define REQ_DONTPREP (1ULL << __REQ_DONTPREP) -#define REQ_QUEUED (1ULL << __REQ_QUEUED) -#define REQ_ELVPRIV (1ULL << __REQ_ELVPRIV) -#define REQ_FAILED (1ULL << __REQ_FAILED) -#define REQ_QUIET (1ULL << __REQ_QUIET) -#define REQ_PREEMPT (1ULL << __REQ_PREEMPT) -#define REQ_ALLOCED (1ULL << __REQ_ALLOCED) -#define REQ_COPY_USER (1ULL << __REQ_COPY_USER) -#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) -#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) -#define REQ_IO_STAT (1ULL << __REQ_IO_STAT) -#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) -#define REQ_PM (1ULL << __REQ_PM) -#define REQ_HASHED (1ULL << __REQ_HASHED) -#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) - -enum req_op { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ -}; +/* obsolete, don't use in new code */ +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_opf = op | op_flags; +} -#define REQ_OP_BITS 3 +static inline bool op_is_write(unsigned int op) +{ + return (op & 1); +} + +/* + * Reads are always treated as synchronous, as are requests with the FUA or + * PREFLUSH flag. Other operations may be marked as synchronous using the + * REQ_SYNC flag. + */ +static inline bool op_is_sync(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_READ || + (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); +} typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U @@ -271,4 +255,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +struct blk_issue_stat { + u64 time; +}; + +#define BLK_RQ_STAT_BATCH 64 + +struct blk_rq_stat { + s64 mean; + u64 min; + u64 max; + s32 nr_samples; + s32 nr_batch; + u64 batch; + s64 time; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c47c358ba052..c5393766909d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include <linux/rcupdate.h> #include <linux/percpu-refcount.h> #include <linux/scatterlist.h> +#include <linux/blkzoned.h> struct module; struct scsi_ioctl_command; @@ -37,6 +38,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -77,6 +79,55 @@ enum rq_cmd_type_bits { REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; +/* + * request flags */ +typedef __u32 __bitwise req_flags_t; + +/* elevator knows about this request */ +#define RQF_SORTED ((__force req_flags_t)(1 << 0)) +/* drive already may have started this one */ +#define RQF_STARTED ((__force req_flags_t)(1 << 1)) +/* uses tagged queueing */ +#define RQF_QUEUED ((__force req_flags_t)(1 << 2)) +/* may not be passed by ioscheduler */ +#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) +/* request for flush sequence */ +#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) +/* merge of different types, fail separately */ +#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) +/* track inflight for MQ */ +#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) +/* don't call prep for this one */ +#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) +/* set for "ide_preempt" requests and also for requests for which the SCSI + "quiesce" state must be ignored. */ +#define RQF_PREEMPT ((__force req_flags_t)(1 << 8)) +/* contains copies of user pages */ +#define RQF_COPY_USER ((__force req_flags_t)(1 << 9)) +/* vaguely specified driver internal error. Ignored by the block layer */ +#define RQF_FAILED ((__force req_flags_t)(1 << 10)) +/* don't warn about errors */ +#define RQF_QUIET ((__force req_flags_t)(1 << 11)) +/* elevator private data attached */ +#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) +/* account I/O stat */ +#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) +/* request came from our alloc pool */ +#define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) +/* runtime pm request */ +#define RQF_PM ((__force req_flags_t)(1 << 15)) +/* on IO scheduler merge hash */ +#define RQF_HASHED ((__force req_flags_t)(1 << 16)) +/* IO stats tracking on */ +#define RQF_STATS ((__force req_flags_t)(1 << 17)) +/* Look at ->special_vec for the actual data payload instead of the + bio chain. */ +#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) + +/* flags that prevent us from merging requests: */ +#define RQF_NOMERGE_FLAGS \ + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) + #define BLK_MAX_CDB 16 /* @@ -97,7 +148,8 @@ struct request { int cpu; unsigned cmd_type; - u64 cmd_flags; + unsigned int cmd_flags; /* op and common flags */ + req_flags_t rq_flags; unsigned long atomic_flags; /* the following two fields are internal, NEVER access directly */ @@ -126,6 +178,7 @@ struct request { */ union { struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; void *completion_data; }; @@ -151,6 +204,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + struct blk_issue_stat issue_stat; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -198,20 +252,6 @@ struct request { struct request *next_rq; }; -#define REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS) -#define req_op(req) ((req)->cmd_flags >> REQ_OP_SHIFT) - -#define req_set_op(req, op) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (req)->cmd_flags &= ((1ULL << REQ_OP_SHIFT) - 1); \ - (req)->cmd_flags |= ((u64) (op) << REQ_OP_SHIFT); \ -} while (0) - -#define req_set_op_attrs(req, op, flags) do { \ - req_set_op(req, op); \ - (req)->cmd_flags |= flags; \ -} while (0) - static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -261,6 +301,15 @@ struct blk_queue_tag { #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) +/* + * Zoned block device models (zoned limit). + */ +enum blk_zoned_model { + BLK_ZONED_NONE, /* Regular block device */ + BLK_ZONED_HA, /* Host-aware zoned block device */ + BLK_ZONED_HM, /* Host-managed zoned block device */ +}; + struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; @@ -278,6 +327,7 @@ struct queue_limits { unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; + unsigned int max_write_zeroes_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -290,8 +340,45 @@ struct queue_limits { unsigned char cluster; unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; + enum blk_zoned_model zoned; }; +#ifdef CONFIG_BLK_DEV_ZONED + +struct blk_zone_report_hdr { + unsigned int nr_zones; + u8 padding[60]; +}; + +extern int blkdev_report_zones(struct block_device *bdev, + sector_t sector, struct blk_zone *zones, + unsigned int *nr_zones, gfp_t gfp_mask); +extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, + sector_t nr_sectors, gfp_t gfp_mask); + +extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); + +#else /* CONFIG_BLK_DEV_ZONED */ + +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + +static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + +#endif /* CONFIG_BLK_DEV_ZONED */ + struct request_queue { /* * Together with queue_head for cacheline sharing @@ -302,6 +389,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg * is used, root blkg allocates from @q->root_rl and all other @@ -327,6 +416,8 @@ struct request_queue { struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; + unsigned int queue_depth; + /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; @@ -412,6 +503,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -420,6 +514,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; + int poll_nsec; struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; @@ -505,6 +600,7 @@ struct request_queue { #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ +#define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -601,7 +697,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) REQ_FAILFAST_DRIVER)) #define blk_account_rq(rq) \ - (((rq)->cmd_flags & REQ_STARTED) && \ + (((rq)->rq_flags & RQF_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) @@ -627,17 +723,31 @@ static inline unsigned int blk_queue_cluster(struct request_queue *q) return q->limits.cluster; } -/* - * We regard a request as sync, if either a read or a sync write - */ -static inline bool rw_is_sync(int op, unsigned int rw_flags) +static inline enum blk_zoned_model +blk_queue_zoned_model(struct request_queue *q) { - return op == REQ_OP_READ || (rw_flags & REQ_SYNC); + return q->limits.zoned; +} + +static inline bool blk_queue_is_zoned(struct request_queue *q) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + case BLK_ZONED_HM: + return true; + default: + return false; + } +} + +static inline unsigned int blk_queue_zone_size(struct request_queue *q) +{ + return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } static inline bool rq_is_sync(struct request *rq) { - return rw_is_sync(req_op(rq), rq->cmd_flags); + return op_is_sync(rq->cmd_flags); } static inline bool blk_rl_full(struct request_list *rl, bool sync) @@ -669,8 +779,13 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_FLUSH) return false; + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; + if (rq->rq_flags & RQF_NOMERGE_FLAGS) + return false; return true; } @@ -683,6 +798,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) return false; } +static inline unsigned int blk_queue_depth(struct request_queue *q) +{ + if (q->queue_depth) + return q->queue_depth; + + return q->nr_requests; +} + /* * q->prep_rq_fn return values */ @@ -790,8 +913,6 @@ extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); -extern void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, @@ -824,6 +945,7 @@ extern void __blk_run_queue(struct request_queue *q); extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); +extern void blk_mq_quiesce_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); @@ -837,7 +959,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -bool blk_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -888,6 +1010,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_SAME)) return q->limits.max_write_same_sectors; + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; + return q->limits.max_sectors; } @@ -991,6 +1116,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_same_sectors(struct request_queue *q, unsigned int max_write_same_sectors); +extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, @@ -999,6 +1126,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, @@ -1027,6 +1155,13 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); +static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return 1; + return rq->nr_phys_segments; +} + extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); @@ -1057,7 +1192,7 @@ static inline int blk_pre_runtime_suspend(struct request_queue *q) static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} static inline void blk_pre_runtime_resume(struct request_queue *q) {} static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} -extern inline void blk_set_runtime_active(struct request_queue *q) {} +static inline void blk_set_runtime_active(struct request_queue *q) {} #endif /* @@ -1078,6 +1213,7 @@ struct blk_plug { struct list_head cb_list; /* md requires an unplug callback */ }; #define BLK_MAX_REQUEST_COUNT 16 +#define BLK_PLUG_FLUSH_SIZE (128 * 1024) struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); @@ -1151,6 +1287,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct bio **biop); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); +extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard); static inline int sb_issue_discard(struct super_block *sb, sector_t block, @@ -1354,6 +1493,46 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return q->limits.max_write_zeroes_sectors; + + return 0; +} + +static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zoned_model(q); + + return BLK_ZONED_NONE; +} + +static inline bool bdev_is_zoned(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_is_zoned(q); + + return false; +} + +static inline unsigned int bdev_zone_size(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zone_size(q); + + return 0; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cceb72f9e29f..e417f080219a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq) } extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes); +extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8dbd7879fdc6..67bcef2ecddb 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -1,7 +1,7 @@ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H -#include <linux/blk_types.h> +#include <linux/bvec.h> #include <linux/kref.h> #include <linux/mutex.h> #include <linux/net.h> diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h index b91b023deffb..a52c6580cc9a 100644 --- a/include/linux/dm-io.h +++ b/include/linux/dm-io.h @@ -58,7 +58,7 @@ struct dm_io_notify { struct dm_io_client; struct dm_io_request { int bi_op; /* REQ_OP */ - int bi_op_flags; /* rq_flag_bits */ + int bi_op_flags; /* req_flag_bits */ struct dm_io_memory mem; /* Memory to use for io */ struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */ struct dm_io_client *client; /* Client memory handler */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index e7f358d2e5fc..b276e9ef0e0b 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -30,7 +30,7 @@ typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_may_queue_fn) (struct request_queue *, int, int); +typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); @@ -108,6 +108,11 @@ struct elevator_type #define ELV_HASH_BITS 6 +void elv_rqhash_del(struct request_queue *q, struct request *rq); +void elv_rqhash_add(struct request_queue *q, struct request *rq); +void elv_rqhash_reposition(struct request_queue *q, struct request *rq); +struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); + /* * each queue has an elevator_queue associated with it */ @@ -139,7 +144,7 @@ extern struct request *elv_former_request(struct request_queue *, struct request extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); extern void elv_unregister_queue(struct request_queue *q); -extern int elv_may_queue(struct request_queue *, int, int); +extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, struct bio *bio, gfp_t gfp_mask); diff --git a/include/linux/fs.h b/include/linux/fs.h index dc0478c07b2a..b84230e070be 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -28,7 +28,6 @@ #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> -#include <linux/blk_types.h> #include <linux/workqueue.h> #include <linux/percpu-rwsem.h> #include <linux/delayed_call.h> @@ -38,6 +37,7 @@ struct backing_dev_info; struct bdi_writeback; +struct bio; struct export_operations; struct hd_geometry; struct iovec; @@ -152,58 +152,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define CHECK_IOVEC_ONLY -1 /* - * The below are the various read and write flags that we support. Some of - * them include behavioral modifiers that send information down to the - * block layer and IO scheduler. They should be used along with a req_op. - * Terminology: - * - * The block layer uses device plugging to defer IO a little bit, in - * the hope that we will see more IO very shortly. This increases - * coalescing of adjacent IO and thus reduces the number of IOs we - * have to send to the device. It also allows for better queuing, - * if the IO isn't mergeable. If the caller is going to be waiting - * for the IO, then he must ensure that the device is unplugged so - * that the IO is dispatched to the driver. - * - * All IO is handled async in Linux. This is fine for background - * writes, but for reads or writes that someone waits for completion - * on, we want to notify the block layer and IO scheduler so that they - * know about it. That allows them to make better scheduling - * decisions. So when the below references 'sync' and 'async', it - * is referencing this priority hint. - * - * With that in mind, the available types are: - * - * READ A normal read operation. Device will be plugged. - * READ_SYNC A synchronous read. Device is not plugged, caller can - * immediately wait on this read without caring about - * unplugging. - * WRITE A normal async write. Device will be plugged. - * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down - * the hint that someone will be waiting on this IO - * shortly. The write equivalent of READ_SYNC. - * WRITE_ODIRECT Special case write for O_DIRECT only. - * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. - * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on - * non-volatile media on completion. - * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded - * by a cache flush and data is guaranteed to be on - * non-volatile media on completion. - * - */ -#define RW_MASK REQ_OP_WRITE - -#define READ REQ_OP_READ -#define WRITE REQ_OP_WRITE - -#define READ_SYNC REQ_SYNC -#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) -#define WRITE_ODIRECT REQ_SYNC -#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH) -#define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA) -#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA) - -/* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ @@ -2499,19 +2447,6 @@ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); #ifdef CONFIG_BLOCK -static inline bool op_is_write(unsigned int op) -{ - return op == REQ_OP_READ ? false : true; -} - -/* - * return data direction, READ or WRITE - */ -static inline int bio_data_dir(struct bio *bio) -{ - return op_is_write(bio_op(bio)) ? WRITE : READ; -} - extern void check_disk_size_change(struct gendisk *disk, struct block_device *bdev); extern int revalidate_disk(struct gendisk *); @@ -2782,7 +2717,6 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK -extern blk_qc_t submit_bio(struct bio *); extern int bdev_read_only(struct block_device *); #endif extern int set_blocksize(struct block_device *, int); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bc6ed52a39b9..01b6b460c34d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -50,6 +50,10 @@ #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) #define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) +/* generic data direction definitions */ +#define READ 0 +#define WRITE 1 + #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) #define u64_to_user_ptr(x) ( \ diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index d190786e4ad8..7c273bbc5351 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -47,6 +47,7 @@ struct ppa_addr { struct nvm_rq; struct nvm_id; struct nvm_dev; +struct nvm_tgt_dev; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); @@ -107,6 +108,8 @@ enum { NVM_RSP_NOT_CHANGEABLE = 0x1, NVM_RSP_ERR_FAILWRITE = 0x40ff, NVM_RSP_ERR_EMPTYPAGE = 0x42ff, + NVM_RSP_ERR_FAILECC = 0x4281, + NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ NVM_OP_HBREAD = 0x02, @@ -208,7 +211,7 @@ struct nvm_id { struct nvm_target { struct list_head list; - struct nvm_dev *dev; + struct nvm_tgt_dev *dev; struct nvm_tgt_type *type; struct gendisk *disk; }; @@ -228,7 +231,7 @@ typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { struct nvm_tgt_instance *ins; - struct nvm_dev *dev; + struct nvm_tgt_dev *dev; struct bio *bio; @@ -263,35 +266,12 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) return rqdata + 1; } -struct nvm_block; - -struct nvm_lun { - int id; - - int lun_id; - int chnl_id; - - spinlock_t lock; - - unsigned int nr_free_blocks; /* Number of unused blocks */ - struct nvm_block *blocks; -}; - enum { NVM_BLK_ST_FREE = 0x1, /* Free block */ NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; -struct nvm_block { - struct list_head list; - struct nvm_lun *lun; - unsigned long id; - - void *priv; - int state; -}; - /* system block cpu representation */ struct nvm_sb_info { unsigned long seqnr; @@ -301,22 +281,12 @@ struct nvm_sb_info { struct ppa_addr fs_ppa; }; -struct nvm_dev { - struct nvm_dev_ops *ops; - - struct list_head devices; - - /* Media manager */ - struct nvmm_type *mt; - void *mp; - - /* System blocks */ - struct nvm_sb_info sb; - - /* Device information */ +/* Device generic information */ +struct nvm_geo { int nr_chnls; + int nr_luns; + int luns_per_chnl; /* -1 if channels are not symmetric */ int nr_planes; - int luns_per_chnl; int sec_per_pg; /* only sectors for a single page */ int pgs_per_blk; int blks_per_lun; @@ -336,14 +306,44 @@ struct nvm_dev { int sec_per_pl; /* all sectors across planes */ int sec_per_blk; int sec_per_lun; +}; + +struct nvm_tgt_dev { + /* Device information */ + struct nvm_geo geo; + + /* Base ppas for target LUNs */ + struct ppa_addr *luns; + + sector_t total_secs; + + struct nvm_id identity; + struct request_queue *q; + + struct nvm_dev *parent; + void *map; +}; + +struct nvm_dev { + struct nvm_dev_ops *ops; + + struct list_head devices; + + /* Media manager */ + struct nvmm_type *mt; + void *mp; + + /* System blocks */ + struct nvm_sb_info sb; + + /* Device information */ + struct nvm_geo geo; /* lower page table */ int lps_per_blk; int *lptbl; - unsigned long total_blocks; unsigned long total_secs; - int nr_luns; unsigned long *lun_map; void *dma_pool; @@ -352,26 +352,57 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; - struct device dev; - struct device *parent_dev; char name[DISK_NAME_LEN]; void *private_data; + void *rmap; + struct mutex mlock; spinlock_t lock; }; +static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, + u64 pba) +{ + struct ppa_addr l; + int secs, pgs, blks, luns; + sector_t ppa = pba; + + l.ppa = 0; + + div_u64_rem(ppa, geo->sec_per_pg, &secs); + l.g.sec = secs; + + sector_div(ppa, geo->sec_per_pg); + div_u64_rem(ppa, geo->pgs_per_blk, &pgs); + l.g.pg = pgs; + + sector_div(ppa, geo->pgs_per_blk); + div_u64_rem(ppa, geo->blks_per_lun, &blks); + l.g.blk = blks; + + sector_div(ppa, geo->blks_per_lun); + div_u64_rem(ppa, geo->luns_per_chnl, &luns); + l.g.lun = luns; + + sector_div(ppa, geo->luns_per_chnl); + l.g.ch = ppa; + + return l; +} + static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, struct ppa_addr r) { + struct nvm_geo *geo = &dev->geo; struct ppa_addr l; - l.ppa = ((u64)r.g.blk) << dev->ppaf.blk_offset; - l.ppa |= ((u64)r.g.pg) << dev->ppaf.pg_offset; - l.ppa |= ((u64)r.g.sec) << dev->ppaf.sect_offset; - l.ppa |= ((u64)r.g.pl) << dev->ppaf.pln_offset; - l.ppa |= ((u64)r.g.lun) << dev->ppaf.lun_offset; - l.ppa |= ((u64)r.g.ch) << dev->ppaf.ch_offset; + l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset; + l.ppa |= ((u64)r.g.pg) << geo->ppaf.pg_offset; + l.ppa |= ((u64)r.g.sec) << geo->ppaf.sect_offset; + l.ppa |= ((u64)r.g.pl) << geo->ppaf.pln_offset; + l.ppa |= ((u64)r.g.lun) << geo->ppaf.lun_offset; + l.ppa |= ((u64)r.g.ch) << geo->ppaf.ch_offset; return l; } @@ -379,24 +410,25 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, struct ppa_addr r) { + struct nvm_geo *geo = &dev->geo; struct ppa_addr l; l.ppa = 0; /* * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. */ - l.g.blk = (r.ppa >> dev->ppaf.blk_offset) & - (((1 << dev->ppaf.blk_len) - 1)); - l.g.pg |= (r.ppa >> dev->ppaf.pg_offset) & - (((1 << dev->ppaf.pg_len) - 1)); - l.g.sec |= (r.ppa >> dev->ppaf.sect_offset) & - (((1 << dev->ppaf.sect_len) - 1)); - l.g.pl |= (r.ppa >> dev->ppaf.pln_offset) & - (((1 << dev->ppaf.pln_len) - 1)); - l.g.lun |= (r.ppa >> dev->ppaf.lun_offset) & - (((1 << dev->ppaf.lun_len) - 1)); - l.g.ch |= (r.ppa >> dev->ppaf.ch_offset) & - (((1 << dev->ppaf.ch_len) - 1)); + l.g.blk = (r.ppa >> geo->ppaf.blk_offset) & + (((1 << geo->ppaf.blk_len) - 1)); + l.g.pg |= (r.ppa >> geo->ppaf.pg_offset) & + (((1 << geo->ppaf.pg_len) - 1)); + l.g.sec |= (r.ppa >> geo->ppaf.sect_offset) & + (((1 << geo->ppaf.sect_len) - 1)); + l.g.pl |= (r.ppa >> geo->ppaf.pln_offset) & + (((1 << geo->ppaf.pln_len) - 1)); + l.g.lun |= (r.ppa >> geo->ppaf.lun_offset) & + (((1 << geo->ppaf.lun_len) - 1)); + l.g.ch |= (r.ppa >> geo->ppaf.ch_offset) & + (((1 << geo->ppaf.ch_len) - 1)); return l; } @@ -411,18 +443,13 @@ static inline void ppa_set_empty(struct ppa_addr *ppa_addr) ppa_addr->ppa = ADDR_EMPTY; } -static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, - struct nvm_block *blk) +static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) { - struct ppa_addr ppa; - struct nvm_lun *lun = blk->lun; - - ppa.ppa = 0; - ppa.g.blk = blk->id % dev->blks_per_lun; - ppa.g.lun = lun->lun_id; - ppa.g.ch = lun->chnl_id; + if (ppa_empty(ppa1) || ppa_empty(ppa2)) + return 0; - return ppa; + return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) && + (ppa1.g.blk == ppa2.g.blk)); } static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) @@ -432,7 +459,7 @@ static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); +typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); typedef void (nvm_tgt_exit_fn)(void *); struct nvm_tgt_type { @@ -465,23 +492,18 @@ typedef void (nvmm_unregister_fn)(struct nvm_dev *); typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); -typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *, - struct nvm_lun *, unsigned long); -typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *); -typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *); -typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *); -typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *); -typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, - unsigned long); -typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); -typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); -typedef int (nvmm_reserve_lun)(struct nvm_dev *, int); -typedef void (nvmm_release_lun)(struct nvm_dev *, int); -typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); - +typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *); +typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int); typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); +typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *, + struct ppa_addr, int); +typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int); + +enum { + TRANS_TGT_TO_DEV = 0x0, + TRANS_DEV_TO_TGT = 0x1, +}; struct nvmm_type { const char *name; @@ -493,54 +515,41 @@ struct nvmm_type { nvmm_create_tgt_fn *create_tgt; nvmm_remove_tgt_fn *remove_tgt; - /* Block administration callbacks */ - nvmm_get_blk_fn *get_blk; - nvmm_put_blk_fn *put_blk; - nvmm_open_blk_fn *open_blk; - nvmm_close_blk_fn *close_blk; - nvmm_flush_blk_fn *flush_blk; - nvmm_submit_io_fn *submit_io; nvmm_erase_blk_fn *erase_blk; - /* Bad block mgmt */ - nvmm_mark_blk_fn *mark_blk; - - /* Configuration management */ - nvmm_get_lun_fn *get_lun; - nvmm_reserve_lun *reserve_lun; - nvmm_release_lun *release_lun; - - /* Statistics */ - nvmm_lun_info_print_fn *lun_info_print; - nvmm_get_area_fn *get_area; nvmm_put_area_fn *put_area; + nvmm_trans_ppa_fn *trans_ppa; + nvmm_part_to_tgt_fn *part_to_tgt; + struct list_head list; }; extern int nvm_register_mgr(struct nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); -extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, - unsigned long); -extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); - extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); - -extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); +extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int); +extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, + int, int); +extern int nvm_max_phys_sects(struct nvm_tgt_dev *); +extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); -extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); +extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); +extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, + void *); +extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); +extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); @@ -548,6 +557,7 @@ extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); +extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); /* sysblk.c */ #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ @@ -569,10 +579,10 @@ extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_dev_factory(struct nvm_dev *, int flags); -#define nvm_for_each_lun_ppa(dev, ppa, chid, lunid) \ - for ((chid) = 0, (ppa).ppa = 0; (chid) < (dev)->nr_chnls; \ +#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid) \ + for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls; \ (chid)++, (ppa).g.ch = (chid)) \ - for ((lunid) = 0; (lunid) < (dev)->luns_per_chnl; \ + for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl; \ (lunid)++, (ppa).g.lun = (lunid)) #else /* CONFIG_NVM */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h new file mode 100644 index 000000000000..f21471f7ee40 --- /dev/null +++ b/include/linux/nvme-fc-driver.h @@ -0,0 +1,851 @@ +/* + * Copyright (c) 2016, Avago Technologies + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVME_FC_DRIVER_H +#define _NVME_FC_DRIVER_H 1 + + +/* + * ********************** LLDD FC-NVME Host API ******************** + * + * For FC LLDD's that are the NVME Host role. + * + * ****************************************************************** + */ + + + +/* FC Port role bitmask - can merge with FC Port Roles in fc transport */ +#define FC_PORT_ROLE_NVME_INITIATOR 0x10 +#define FC_PORT_ROLE_NVME_TARGET 0x11 +#define FC_PORT_ROLE_NVME_DISCOVERY 0x12 + + +/** + * struct nvme_fc_port_info - port-specific ids and FC connection-specific + * data element used during NVME Host role + * registrations + * + * Static fields describing the port being registered: + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * + * Initialization values for dynamic port fields: + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + */ +struct nvme_fc_port_info { + u64 node_name; + u64 port_name; + u32 port_role; + u32 port_id; +}; + + +/** + * struct nvmefc_ls_req - Request structure passed from NVME-FC transport + * to LLDD in order to perform a NVME FC-4 LS + * request and obtain a response. + * + * Values set by the NVME-FC layer prior to calling the LLDD ls_req + * entrypoint. + * @rqstaddr: pointer to request buffer + * @rqstdma: PCI DMA address of request buffer + * @rqstlen: Length, in bytes, of request buffer + * @rspaddr: pointer to response buffer + * @rspdma: PCI DMA address of response buffer + * @rsplen: Length, in bytes, of response buffer + * @timeout: Maximum amount of time, in seconds, to wait for the LS response. + * If timeout exceeded, LLDD to abort LS exchange and complete + * LS request with error status. + * @private: pointer to memory allocated alongside the ls request structure + * that is specifically for the LLDD to use while processing the + * request. The length of the buffer corresponds to the + * lsrqst_priv_sz value specified in the nvme_fc_port_template + * supplied by the LLDD. + * @done: The callback routine the LLDD is to invoke upon completion of + * the LS request. req argument is the pointer to the original LS + * request structure. Status argument must be 0 upon success, a + * negative errno on failure (example: -ENXIO). + */ +struct nvmefc_ls_req { + void *rqstaddr; + dma_addr_t rqstdma; + u32 rqstlen; + void *rspaddr; + dma_addr_t rspdma; + u32 rsplen; + u32 timeout; + + void *private; + + void (*done)(struct nvmefc_ls_req *req, int status); + +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +enum nvmefc_fcp_datadir { + NVMEFC_FCP_NODATA, /* payload_length and sg_cnt will be zero */ + NVMEFC_FCP_WRITE, + NVMEFC_FCP_READ, +}; + + +#define NVME_FC_MAX_SEGMENTS 256 + +/** + * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport + * to LLDD in order to perform a NVME FCP IO operation. + * + * Values set by the NVME-FC layer prior to calling the LLDD fcp_io + * entrypoint. + * @cmdaddr: pointer to the FCP CMD IU buffer + * @rspaddr: pointer to the FCP RSP IU buffer + * @cmddma: PCI DMA address of the FCP CMD IU buffer + * @rspdma: PCI DMA address of the FCP RSP IU buffer + * @cmdlen: Length, in bytes, of the FCP CMD IU buffer + * @rsplen: Length, in bytes, of the FCP RSP IU buffer + * @payload_length: Length of DATA_IN or DATA_OUT payload data to transfer + * @sg_table: scatter/gather structure for payload data + * @first_sgl: memory for 1st scatter/gather list segment for payload data + * @sg_cnt: number of elements in the scatter/gather list + * @io_dir: direction of the FCP request (see NVMEFC_FCP_xxx) + * @sqid: The nvme SQID the command is being issued on + * @done: The callback routine the LLDD is to invoke upon completion of + * the FCP operation. req argument is the pointer to the original + * FCP IO operation. + * @private: pointer to memory allocated alongside the FCP operation + * request structure that is specifically for the LLDD to use + * while processing the operation. The length of the buffer + * corresponds to the fcprqst_priv_sz value specified in the + * nvme_fc_port_template supplied by the LLDD. + * + * Values set by the LLDD indicating completion status of the FCP operation. + * Must be set prior to calling the done() callback. + * @transferred_length: amount of payload data, in bytes, that were + * transferred. Should equal payload_length on success. + * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. + * @status: Completion status of the FCP operation. must be 0 upon success, + * NVME_SC_FC_xxx value upon failure. Note: this is NOT a + * reflection of the NVME CQE completion status. Only the status + * of the FCP operation at the NVME-FC level. + */ +struct nvmefc_fcp_req { + void *cmdaddr; + void *rspaddr; + dma_addr_t cmddma; + dma_addr_t rspdma; + u16 cmdlen; + u16 rsplen; + + u32 payload_length; + struct sg_table sg_table; + struct scatterlist *first_sgl; + int sg_cnt; + enum nvmefc_fcp_datadir io_dir; + + __le16 sqid; + + void (*done)(struct nvmefc_fcp_req *req); + + void *private; + + u32 transferred_length; + u16 rcv_rsplen; + u32 status; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/* + * Direct copy of fc_port_state enum. For later merging + */ +enum nvme_fc_obj_state { + FC_OBJSTATE_UNKNOWN, + FC_OBJSTATE_NOTPRESENT, + FC_OBJSTATE_ONLINE, + FC_OBJSTATE_OFFLINE, /* User has taken Port Offline */ + FC_OBJSTATE_BLOCKED, + FC_OBJSTATE_BYPASSED, + FC_OBJSTATE_DIAGNOSTICS, + FC_OBJSTATE_LINKDOWN, + FC_OBJSTATE_ERROR, + FC_OBJSTATE_LOOPBACK, + FC_OBJSTATE_DELETED, +}; + + +/** + * struct nvme_fc_local_port - structure used between NVME-FC transport and + * a LLDD to reference a local NVME host port. + * Allocated/created by the nvme_fc_register_localport() + * transport interface. + * + * Fields with static values for the port. Initialized by the + * port_info struct supplied to the registration call. + * @port_num: NVME-FC transport host port number + * @port_role: NVME roles are supported on the port (see FC_PORT_ROLE_xxx) + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @private: pointer to memory allocated alongside the local port + * structure that is specifically for the LLDD to use. + * The length of the buffer corresponds to the local_priv_sz + * value specified in the nvme_fc_port_template supplied by + * the LLDD. + * + * Fields with dynamic values. Values may change base on link state. LLDD + * may reference fields directly to change them. Initialized by the + * port_info struct supplied to the registration call. + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + * @port_state: Operational state of the port. + */ +struct nvme_fc_local_port { + /* static/read-only fields */ + u32 port_num; + u32 port_role; + u64 node_name; + u64 port_name; + + void *private; + + /* dynamic fields */ + u32 port_id; + enum nvme_fc_obj_state port_state; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/** + * struct nvme_fc_remote_port - structure used between NVME-FC transport and + * a LLDD to reference a remote NVME subsystem port. + * Allocated/created by the nvme_fc_register_remoteport() + * transport interface. + * + * Fields with static values for the port. Initialized by the + * port_info struct supplied to the registration call. + * @port_num: NVME-FC transport remote subsystem port number + * @port_role: NVME roles are supported on the port (see FC_PORT_ROLE_xxx) + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @localport: pointer to the NVME-FC local host port the subsystem is + * connected to. + * @private: pointer to memory allocated alongside the remote port + * structure that is specifically for the LLDD to use. + * The length of the buffer corresponds to the remote_priv_sz + * value specified in the nvme_fc_port_template supplied by + * the LLDD. + * + * Fields with dynamic values. Values may change base on link or login + * state. LLDD may reference fields directly to change them. Initialized by + * the port_info struct supplied to the registration call. + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + * @port_state: Operational state of the remote port. Valid values are + * ONLINE or UNKNOWN. + */ +struct nvme_fc_remote_port { + /* static fields */ + u32 port_num; + u32 port_role; + u64 node_name; + u64 port_name; + + struct nvme_fc_local_port *localport; + + void *private; + + /* dynamic fields */ + u32 port_id; + enum nvme_fc_obj_state port_state; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/** + * struct nvme_fc_port_template - structure containing static entrypoints and + * operational parameters for an LLDD that supports NVME host + * behavior. Passed by reference in port registrations. + * NVME-FC transport remembers template reference and may + * access it during runtime operation. + * + * Host/Initiator Transport Entrypoints/Parameters: + * + * @localport_delete: The LLDD initiates deletion of a localport via + * nvme_fc_deregister_localport(). However, the teardown is + * asynchronous. This routine is called upon the completion of the + * teardown to inform the LLDD that the localport has been deleted. + * Entrypoint is Mandatory. + * + * @remoteport_delete: The LLDD initiates deletion of a remoteport via + * nvme_fc_deregister_remoteport(). However, the teardown is + * asynchronous. This routine is called upon the completion of the + * teardown to inform the LLDD that the remoteport has been deleted. + * Entrypoint is Mandatory. + * + * @create_queue: Upon creating a host<->controller association, queues are + * created such that they can be affinitized to cpus/cores. This + * callback into the LLDD to notify that a controller queue is being + * created. The LLDD may choose to allocate an associated hw queue + * or map it onto a shared hw queue. Upon return from the call, the + * LLDD specifies a handle that will be given back to it for any + * command that is posted to the controller queue. The handle can + * be used by the LLDD to map quickly to the proper hw queue for + * command execution. The mask of cpu's that will map to this queue + * at the block-level is also passed in. The LLDD should use the + * queue id and/or cpu masks to ensure proper affinitization of the + * controller queue to the hw queue. + * Entrypoint is Optional. + * + * @delete_queue: This is the inverse of the crete_queue. During + * host<->controller association teardown, this routine is called + * when a controller queue is being terminated. Any association with + * a hw queue should be termined. If there is a unique hw queue, the + * hw queue should be torn down. + * Entrypoint is Optional. + * + * @poll_queue: Called to poll for the completion of an io on a blk queue. + * Entrypoint is Optional. + * + * @ls_req: Called to issue a FC-NVME FC-4 LS service request. + * The nvme_fc_ls_req structure will fully describe the buffers for + * the request payload and where to place the response payload. The + * LLDD is to allocate an exchange, issue the LS request, obtain the + * LS response, and call the "done" routine specified in the request + * structure (argument to done is the ls request structure itself). + * Entrypoint is Mandatory. + * + * @fcp_io: called to issue a FC-NVME I/O request. The I/O may be for + * an admin queue or an i/o queue. The nvmefc_fcp_req structure will + * fully describe the io: the buffer containing the FC-NVME CMD IU + * (which contains the SQE), the sg list for the payload if applicable, + * and the buffer to place the FC-NVME RSP IU into. The LLDD will + * complete the i/o, indicating the amount of data transferred or + * any transport error, and call the "done" routine specified in the + * request structure (argument to done is the fcp request structure + * itself). + * Entrypoint is Mandatory. + * + * @ls_abort: called to request the LLDD to abort the indicated ls request. + * The call may return before the abort has completed. After aborting + * the request, the LLDD must still call the ls request done routine + * indicating an FC transport Aborted status. + * Entrypoint is Mandatory. + * + * @fcp_abort: called to request the LLDD to abort the indicated fcp request. + * The call may return before the abort has completed. After aborting + * the request, the LLDD must still call the fcp request done routine + * indicating an FC transport Aborted status. + * Entrypoint is Mandatory. + * + * @max_hw_queues: indicates the maximum number of hw queues the LLDD + * supports for cpu affinitization. + * Value is Mandatory. Must be at least 1. + * + * @max_sgl_segments: indicates the maximum number of sgl segments supported + * by the LLDD + * Value is Mandatory. Must be at least 1. Recommend at least 256. + * + * @max_dif_sgl_segments: indicates the maximum number of sgl segments + * supported by the LLDD for DIF operations. + * Value is Mandatory. Must be at least 1. Recommend at least 256. + * + * @dma_boundary: indicates the dma address boundary where dma mappings + * will be split across. + * Value is Mandatory. Typical value is 0xFFFFFFFF to split across + * 4Gig address boundarys + * + * @local_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like fc nvme layer to allocate on the LLDD's + * behalf whenever a localport is allocated. The additional memory + * area solely for the of the LLDD and its location is specified by + * the localport->private pointer. + * Value is Mandatory. Allowed to be zero. + * + * @remote_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like fc nvme layer to allocate on the LLDD's + * behalf whenever a remoteport is allocated. The additional memory + * area solely for the of the LLDD and its location is specified by + * the remoteport->private pointer. + * Value is Mandatory. Allowed to be zero. + * + * @lsrqst_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like fc nvme layer to allocate on the LLDD's + * behalf whenever a ls request structure is allocated. The additional + * memory area solely for the of the LLDD and its location is + * specified by the ls_request->private pointer. + * Value is Mandatory. Allowed to be zero. + * + * @fcprqst_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like fc nvme layer to allocate on the LLDD's + * behalf whenever a fcp request structure is allocated. The additional + * memory area solely for the of the LLDD and its location is + * specified by the fcp_request->private pointer. + * Value is Mandatory. Allowed to be zero. + */ +struct nvme_fc_port_template { + /* initiator-based functions */ + void (*localport_delete)(struct nvme_fc_local_port *); + void (*remoteport_delete)(struct nvme_fc_remote_port *); + int (*create_queue)(struct nvme_fc_local_port *, + unsigned int qidx, u16 qsize, + void **handle); + void (*delete_queue)(struct nvme_fc_local_port *, + unsigned int qidx, void *handle); + void (*poll_queue)(struct nvme_fc_local_port *, void *handle); + int (*ls_req)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + struct nvmefc_ls_req *); + int (*fcp_io)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + void *hw_queue_handle, + struct nvmefc_fcp_req *); + void (*ls_abort)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + struct nvmefc_ls_req *); + void (*fcp_abort)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + void *hw_queue_handle, + struct nvmefc_fcp_req *); + + u32 max_hw_queues; + u16 max_sgl_segments; + u16 max_dif_sgl_segments; + u64 dma_boundary; + + /* sizes of additional private data for data structures */ + u32 local_priv_sz; + u32 remote_priv_sz; + u32 lsrqst_priv_sz; + u32 fcprqst_priv_sz; +}; + + +/* + * Initiator/Host functions + */ + +int nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *template, + struct device *dev, + struct nvme_fc_local_port **lport_p); + +int nvme_fc_unregister_localport(struct nvme_fc_local_port *localport); + +int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, + struct nvme_fc_port_info *pinfo, + struct nvme_fc_remote_port **rport_p); + +int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); + + + +/* + * *************** LLDD FC-NVME Target/Subsystem API *************** + * + * For FC LLDD's that are the NVME Subsystem role + * + * ****************************************************************** + */ + +/** + * struct nvmet_fc_port_info - port-specific ids and FC connection-specific + * data element used during NVME Subsystem role + * registrations + * + * Static fields describing the port being registered: + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * + * Initialization values for dynamic port fields: + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + */ +struct nvmet_fc_port_info { + u64 node_name; + u64 port_name; + u32 port_id; +}; + + +/** + * struct nvmefc_tgt_ls_req - Structure used between LLDD and NVMET-FC + * layer to represent the exchange context for + * a FC-NVME Link Service (LS). + * + * The structure is allocated by the LLDD whenever a LS Request is received + * from the FC link. The address of the structure is passed to the nvmet-fc + * layer via the nvmet_fc_rcv_ls_req() call. The address of the structure + * will be passed back to the LLDD when the response is to be transmit. + * The LLDD is to use the address to map back to the LLDD exchange structure + * which maintains information such as the targetport the LS was received + * on, the remote FC NVME initiator that sent the LS, and any FC exchange + * context. Upon completion of the LS response transmit, the address of the + * structure will be passed back to the LS rsp done() routine, allowing the + * nvmet-fc layer to release dma resources. Upon completion of the done() + * routine, no further access will be made by the nvmet-fc layer and the + * LLDD can de-allocate the structure. + * + * Field initialization: + * At the time of the nvmet_fc_rcv_ls_req() call, there is no content that + * is valid in the structure. + * + * When the structure is used for the LLDD->xmt_ls_rsp() call, the nvmet-fc + * layer will fully set the fields in order to specify the response + * payload buffer and its length as well as the done routine to be called + * upon compeletion of the transmit. The nvmet-fc layer will also set a + * private pointer for its own use in the done routine. + * + * Values set by the NVMET-FC layer prior to calling the LLDD xmt_ls_rsp + * entrypoint. + * @rspbuf: pointer to the LS response buffer + * @rspdma: PCI DMA address of the LS response buffer + * @rsplen: Length, in bytes, of the LS response buffer + * @done: The callback routine the LLDD is to invoke upon completion of + * transmitting the LS response. req argument is the pointer to + * the original ls request. + * @nvmet_fc_private: pointer to an internal NVMET-FC layer structure used + * as part of the NVMET-FC processing. The LLDD is not to access + * this pointer. + */ +struct nvmefc_tgt_ls_req { + void *rspbuf; + dma_addr_t rspdma; + u16 rsplen; + + void (*done)(struct nvmefc_tgt_ls_req *req); + void *nvmet_fc_private; /* LLDD is not to access !! */ +}; + +/* Operations that NVME-FC layer may request the LLDD to perform for FCP */ +enum { + NVMET_FCOP_READDATA = 1, /* xmt data to initiator */ + NVMET_FCOP_WRITEDATA = 2, /* xmt data from initiator */ + NVMET_FCOP_READDATA_RSP = 3, /* xmt data to initiator and send + * rsp as well + */ + NVMET_FCOP_RSP = 4, /* send rsp frame */ + NVMET_FCOP_ABORT = 5, /* abort exchange via ABTS */ + NVMET_FCOP_BA_ACC = 6, /* send BA_ACC */ + NVMET_FCOP_BA_RJT = 7, /* send BA_RJT */ +}; + +/** + * struct nvmefc_tgt_fcp_req - Structure used between LLDD and NVMET-FC + * layer to represent the exchange context and + * the specific FC-NVME IU operation(s) to perform + * for a FC-NVME FCP IO. + * + * Structure used between LLDD and nvmet-fc layer to represent the exchange + * context for a FC-NVME FCP I/O operation (e.g. a nvme sqe, the sqe-related + * memory transfers, and its assocated cqe transfer). + * + * The structure is allocated by the LLDD whenever a FCP CMD IU is received + * from the FC link. The address of the structure is passed to the nvmet-fc + * layer via the nvmet_fc_rcv_fcp_req() call. The address of the structure + * will be passed back to the LLDD for the data operations and transmit of + * the response. The LLDD is to use the address to map back to the LLDD + * exchange structure which maintains information such as the targetport + * the FCP I/O was received on, the remote FC NVME initiator that sent the + * FCP I/O, and any FC exchange context. Upon completion of the FCP target + * operation, the address of the structure will be passed back to the FCP + * op done() routine, allowing the nvmet-fc layer to release dma resources. + * Upon completion of the done() routine for either RSP or ABORT ops, no + * further access will be made by the nvmet-fc layer and the LLDD can + * de-allocate the structure. + * + * Field initialization: + * At the time of the nvmet_fc_rcv_fcp_req() call, there is no content that + * is valid in the structure. + * + * When the structure is used for an FCP target operation, the nvmet-fc + * layer will fully set the fields in order to specify the scattergather + * list, the transfer length, as well as the done routine to be called + * upon compeletion of the operation. The nvmet-fc layer will also set a + * private pointer for its own use in the done routine. + * + * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !! + * + * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op + * entrypoint. + * @op: Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx) + * @hwqid: Specifies the hw queue index (0..N-1, where N is the + * max_hw_queues value from the LLD's nvmet_fc_target_template) + * that the operation is to use. + * @offset: Indicates the DATA_OUT/DATA_IN payload offset to be tranferred. + * Field is only valid on WRITEDATA, READDATA, or READDATA_RSP ops. + * @timeout: amount of time, in seconds, to wait for a response from the NVME + * host. A value of 0 is an infinite wait. + * Valid only for the following ops: + * WRITEDATA: caps the wait for data reception + * READDATA_RSP & RSP: caps wait for FCP_CONF reception (if used) + * @transfer_length: the length, in bytes, of the DATA_OUT or DATA_IN payload + * that is to be transferred. + * Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops. + * @ba_rjt: Contains the BA_RJT payload that is to be transferred. + * Valid only for the NVMET_FCOP_BA_RJT op. + * @sg: Scatter/gather list for the DATA_OUT/DATA_IN payload data. + * Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops. + * @sg_cnt: Number of valid entries in the scatter/gather list. + * Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops. + * @rspaddr: pointer to the FCP RSP IU buffer to be transmit + * Used by RSP and READDATA_RSP ops + * @rspdma: PCI DMA address of the FCP RSP IU buffer + * Used by RSP and READDATA_RSP ops + * @rsplen: Length, in bytes, of the FCP RSP IU buffer + * Used by RSP and READDATA_RSP ops + * @done: The callback routine the LLDD is to invoke upon completion of + * the operation. req argument is the pointer to the original + * FCP subsystem op request. + * @nvmet_fc_private: pointer to an internal NVMET-FC layer structure used + * as part of the NVMET-FC processing. The LLDD is not to + * reference this field. + * + * Values set by the LLDD indicating completion status of the FCP operation. + * Must be set prior to calling the done() callback. + * @transferred_length: amount of DATA_OUT payload data received by a + * a WRITEDATA operation. If not a WRITEDATA operation, value must + * be set to 0. Should equal transfer_length on success. + * @fcp_error: status of the FCP operation. Must be 0 on success; on failure + * must be a NVME_SC_FC_xxxx value. + */ +struct nvmefc_tgt_fcp_req { + u8 op; + u16 hwqid; + u32 offset; + u32 timeout; + u32 transfer_length; + struct fc_ba_rjt ba_rjt; + struct scatterlist sg[NVME_FC_MAX_SEGMENTS]; + int sg_cnt; + void *rspaddr; + dma_addr_t rspdma; + u16 rsplen; + + void (*done)(struct nvmefc_tgt_fcp_req *); + + void *nvmet_fc_private; /* LLDD is not to access !! */ + + u32 transferred_length; + int fcp_error; +}; + + +/* Target Features (Bit fields) LLDD supports */ +enum { + NVMET_FCTGTFEAT_READDATA_RSP = (1 << 0), + /* Bit 0: supports the NVMET_FCPOP_READDATA_RSP op, which + * sends (the last) Read Data sequence followed by the RSP + * sequence in one LLDD operation. Errors during Data + * sequence transmit must not allow RSP sequence to be sent. + */ + NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED = (1 << 1), + /* Bit 1: When 0, the LLDD will deliver FCP CMD + * on the CPU it should be affinitized to. Thus work will + * be scheduled on the cpu received on. When 1, the LLDD + * may not deliver the CMD on the CPU it should be worked + * on. The transport should pick a cpu to schedule the work + * on. + */ +}; + + +/** + * struct nvmet_fc_target_port - structure used between NVME-FC transport and + * a LLDD to reference a local NVME subsystem port. + * Allocated/created by the nvme_fc_register_targetport() + * transport interface. + * + * Fields with static values for the port. Initialized by the + * port_info struct supplied to the registration call. + * @port_num: NVME-FC transport subsytem port number + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @private: pointer to memory allocated alongside the local port + * structure that is specifically for the LLDD to use. + * The length of the buffer corresponds to the target_priv_sz + * value specified in the nvme_fc_target_template supplied by + * the LLDD. + * + * Fields with dynamic values. Values may change base on link state. LLDD + * may reference fields directly to change them. Initialized by the + * port_info struct supplied to the registration call. + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + * @port_state: Operational state of the port. + */ +struct nvmet_fc_target_port { + /* static/read-only fields */ + u32 port_num; + u64 node_name; + u64 port_name; + + void *private; + + /* dynamic fields */ + u32 port_id; + enum nvme_fc_obj_state port_state; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/** + * struct nvmet_fc_target_template - structure containing static entrypoints + * and operational parameters for an LLDD that supports NVME + * subsystem behavior. Passed by reference in port + * registrations. NVME-FC transport remembers template + * reference and may access it during runtime operation. + * + * Subsystem/Target Transport Entrypoints/Parameters: + * + * @targetport_delete: The LLDD initiates deletion of a targetport via + * nvmet_fc_unregister_targetport(). However, the teardown is + * asynchronous. This routine is called upon the completion of the + * teardown to inform the LLDD that the targetport has been deleted. + * Entrypoint is Mandatory. + * + * @xmt_ls_rsp: Called to transmit the response to a FC-NVME FC-4 LS service. + * The nvmefc_tgt_ls_req structure is the same LLDD-supplied exchange + * structure specified in the nvmet_fc_rcv_ls_req() call made when + * the LS request was received. The structure will fully describe + * the buffers for the response payload and the dma address of the + * payload. The LLDD is to transmit the response (or return a non-zero + * errno status), and upon completion of the transmit, call the + * "done" routine specified in the nvmefc_tgt_ls_req structure + * (argument to done is the ls reqwuest structure itself). + * After calling the done routine, the LLDD shall consider the + * LS handling complete and the nvmefc_tgt_ls_req structure may + * be freed/released. + * Entrypoint is Mandatory. + * + * @fcp_op: Called to perform a data transfer, transmit a response, or + * abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same + * LLDD-supplied exchange structure specified in the + * nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received. + * The op field in the structure shall indicate the operation for + * the LLDD to perform relative to the io. + * NVMET_FCOP_READDATA operation: the LLDD is to send the + * payload data (described by sglist) to the host in 1 or + * more FC sequences (preferrably 1). Note: the fc-nvme layer + * may call the READDATA operation multiple times for longer + * payloads. + * NVMET_FCOP_WRITEDATA operation: the LLDD is to receive the + * payload data (described by sglist) from the host via 1 or + * more FC sequences (preferrably 1). The LLDD is to generate + * the XFER_RDY IU(s) corresponding to the data being requested. + * Note: the FC-NVME layer may call the WRITEDATA operation + * multiple times for longer payloads. + * NVMET_FCOP_READDATA_RSP operation: the LLDD is to send the + * payload data (described by sglist) to the host in 1 or + * more FC sequences (preferrably 1). If an error occurs during + * payload data transmission, the LLDD is to set the + * nvmefc_tgt_fcp_req fcp_error and transferred_length field, then + * consider the operation complete. On error, the LLDD is to not + * transmit the FCP_RSP iu. If all payload data is transferred + * successfully, the LLDD is to update the nvmefc_tgt_fcp_req + * transferred_length field and may subsequently transmit the + * FCP_RSP iu payload (described by rspbuf, rspdma, rsplen). + * The LLDD is to await FCP_CONF reception to confirm the RSP + * reception by the host. The LLDD may retramsit the FCP_RSP iu + * if necessary per FC-NVME. Upon reception of FCP_CONF, or upon + * FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req + * fcp_error field and consider the operation complete.. + * NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload + * (described by rspbuf, rspdma, rsplen). The LLDD is to await + * FCP_CONF reception to confirm the RSP reception by the host. + * The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME. + * Upon reception of FCP_CONF, or upon FCP_CONF failure, the + * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and + * consider the operation complete.. + * NVMET_FCOP_ABORT: the LLDD is to terminate the exchange + * corresponding to the fcp operation. The LLDD shall send + * ABTS and follow FC exchange abort-multi rules, including + * ABTS retries and possible logout. + * Upon completing the indicated operation, the LLDD is to set the + * status fields for the operation (tranferred_length and fcp_error + * status) in the request, then all the "done" routine + * indicated in the fcp request. Upon return from the "done" + * routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation + * the fc-nvme layer will not longer reference the fcp request, + * allowing the LLDD to free/release the fcp request. + * Note: when calling the done routine for READDATA or WRITEDATA + * operations, the fc-nvme layer may immediate convert, in the same + * thread and before returning to the LLDD, the fcp operation to + * the next operation for the fcp io and call the LLDDs fcp_op + * call again. If fields in the fcp request are to be accessed post + * the done call, the LLDD should save their values prior to calling + * the done routine, and inspect the save values after the done + * routine. + * Returns 0 on success, -<errno> on failure (Ex: -EIO) + * Entrypoint is Mandatory. + * + * @max_hw_queues: indicates the maximum number of hw queues the LLDD + * supports for cpu affinitization. + * Value is Mandatory. Must be at least 1. + * + * @max_sgl_segments: indicates the maximum number of sgl segments supported + * by the LLDD + * Value is Mandatory. Must be at least 1. Recommend at least 256. + * + * @max_dif_sgl_segments: indicates the maximum number of sgl segments + * supported by the LLDD for DIF operations. + * Value is Mandatory. Must be at least 1. Recommend at least 256. + * + * @dma_boundary: indicates the dma address boundary where dma mappings + * will be split across. + * Value is Mandatory. Typical value is 0xFFFFFFFF to split across + * 4Gig address boundarys + * + * @target_features: The LLDD sets bits in this field to correspond to + * optional features that are supported by the LLDD. + * Refer to the NVMET_FCTGTFEAT_xxx values. + * Value is Mandatory. Allowed to be zero. + * + * @target_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like fc nvme layer to allocate on the LLDD's + * behalf whenever a targetport is allocated. The additional memory + * area solely for the of the LLDD and its location is specified by + * the targetport->private pointer. + * Value is Mandatory. Allowed to be zero. + */ +struct nvmet_fc_target_template { + void (*targetport_delete)(struct nvmet_fc_target_port *tgtport); + int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_ls_req *tls_req); + int (*fcp_op)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *); + + u32 max_hw_queues; + u16 max_sgl_segments; + u16 max_dif_sgl_segments; + u64 dma_boundary; + + u32 target_features; + + u32 target_priv_sz; +}; + + +int nvmet_fc_register_targetport(struct nvmet_fc_port_info *portinfo, + struct nvmet_fc_target_template *template, + struct device *dev, + struct nvmet_fc_target_port **tgtport_p); + +int nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *tgtport); + +int nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_ls_req *lsreq, + void *lsreqbuf, u32 lsreqbuf_len); + +int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq, + void *cmdiubuf, u32 cmdiubuf_len); + +#endif /* _NVME_FC_DRIVER_H */ diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h new file mode 100644 index 000000000000..4b45226bd604 --- /dev/null +++ b/include/linux/nvme-fc.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful. + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO + * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. + * See the GNU General Public License for more details, a copy of which + * can be found in the file COPYING included with this package + * + */ + +/* + * This file contains definitions relative to FC-NVME r1.11 and a few + * newer items + */ + +#ifndef _NVME_FC_H +#define _NVME_FC_H 1 + + +#define NVME_CMD_SCSI_ID 0xFD +#define NVME_CMD_FC_ID FC_TYPE_NVME + +/* FC-NVME Cmd IU Flags */ +#define FCNVME_CMD_FLAGS_DIRMASK 0x03 +#define FCNVME_CMD_FLAGS_WRITE 0x01 +#define FCNVME_CMD_FLAGS_READ 0x02 + +struct nvme_fc_cmd_iu { + __u8 scsi_id; + __u8 fc_id; + __be16 iu_len; + __u8 rsvd4[3]; + __u8 flags; + __be64 connection_id; + __be32 csn; + __be32 data_len; + struct nvme_command sqe; + __be32 rsvd88[2]; +}; + +#define NVME_FC_SIZEOF_ZEROS_RSP 12 + +struct nvme_fc_ersp_iu { + __u8 rsvd0[2]; + __be16 iu_len; + __be32 rsn; + __be32 xfrd_len; + __be32 rsvd12; + struct nvme_completion cqe; + /* for now - no additional payload */ +}; + + +/* FC-NVME r1.03/16-119v0 NVME Link Services */ +enum { + FCNVME_LS_RSVD = 0, + FCNVME_LS_RJT = 1, + FCNVME_LS_ACC = 2, + FCNVME_LS_CREATE_ASSOCIATION = 3, + FCNVME_LS_CREATE_CONNECTION = 4, + FCNVME_LS_DISCONNECT = 5, +}; + +/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */ +enum { + FCNVME_LSDESC_RSVD = 0x0, + FCNVME_LSDESC_RQST = 0x1, + FCNVME_LSDESC_RJT = 0x2, + FCNVME_LSDESC_CREATE_ASSOC_CMD = 0x3, + FCNVME_LSDESC_CREATE_CONN_CMD = 0x4, + FCNVME_LSDESC_DISCONN_CMD = 0x5, + FCNVME_LSDESC_CONN_ID = 0x6, + FCNVME_LSDESC_ASSOC_ID = 0x7, +}; + + +/* ********** start of Link Service Descriptors ********** */ + + +/* + * fills in length of a descriptor. Struture minus descriptor header + */ +static inline __be32 fcnvme_lsdesc_len(size_t sz) +{ + return cpu_to_be32(sz - (2 * sizeof(u32))); +} + + +struct fcnvme_ls_rqst_w0 { + u8 ls_cmd; /* FCNVME_LS_xxx */ + u8 zeros[3]; +}; + +/* FCNVME_LSDESC_RQST */ +struct fcnvme_lsdesc_rqst { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + struct fcnvme_ls_rqst_w0 w0; + __be32 rsvd12; +}; + + + + +/* FCNVME_LSDESC_RJT */ +struct fcnvme_lsdesc_rjt { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + u8 rsvd8; + + /* + * Reject reason and explanaction codes are generic + * to ELs's from LS-3. + */ + u8 reason_code; + u8 reason_explanation; + + u8 vendor; + __be32 rsvd12; +}; + + +#define FCNVME_ASSOC_HOSTID_LEN 64 +#define FCNVME_ASSOC_HOSTNQN_LEN 256 +#define FCNVME_ASSOC_SUBNQN_LEN 256 + +/* FCNVME_LSDESC_CREATE_ASSOC_CMD */ +struct fcnvme_lsdesc_cr_assoc_cmd { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be16 ersp_ratio; + __be16 rsvd10; + __be32 rsvd12[9]; + __be16 cntlid; + __be16 sqsize; + __be32 rsvd52; + u8 hostid[FCNVME_ASSOC_HOSTID_LEN]; + u8 hostnqn[FCNVME_ASSOC_HOSTNQN_LEN]; + u8 subnqn[FCNVME_ASSOC_SUBNQN_LEN]; + u8 rsvd632[384]; +}; + +/* FCNVME_LSDESC_CREATE_CONN_CMD */ +struct fcnvme_lsdesc_cr_conn_cmd { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be16 ersp_ratio; + __be16 rsvd10; + __be32 rsvd12[9]; + __be16 qid; + __be16 sqsize; + __be32 rsvd52; +}; + +/* Disconnect Scope Values */ +enum { + FCNVME_DISCONN_ASSOCIATION = 0, + FCNVME_DISCONN_CONNECTION = 1, +}; + +/* FCNVME_LSDESC_DISCONN_CMD */ +struct fcnvme_lsdesc_disconn_cmd { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + u8 rsvd8[3]; + /* note: scope is really a 1 bit field */ + u8 scope; /* FCNVME_DISCONN_xxx */ + __be32 rsvd12; + __be64 id; +}; + +/* FCNVME_LSDESC_CONN_ID */ +struct fcnvme_lsdesc_conn_id { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be64 connection_id; +}; + +/* FCNVME_LSDESC_ASSOC_ID */ +struct fcnvme_lsdesc_assoc_id { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be64 association_id; +}; + +/* r_ctl values */ +enum { + FCNVME_RS_RCTL_DATA = 1, + FCNVME_RS_RCTL_XFER_RDY = 5, + FCNVME_RS_RCTL_RSP = 8, +}; + + +/* ********** start of Link Services ********** */ + + +/* FCNVME_LS_RJT */ +struct fcnvme_ls_rjt { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_rqst rqst; + struct fcnvme_lsdesc_rjt rjt; +}; + +/* FCNVME_LS_ACC */ +struct fcnvme_ls_acc_hdr { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_rqst rqst; + /* Followed by cmd-specific ACC descriptors, see next definitions */ +}; + +/* FCNVME_LS_CREATE_ASSOCIATION */ +struct fcnvme_ls_cr_assoc_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_cr_assoc_cmd assoc_cmd; +}; + +struct fcnvme_ls_cr_assoc_acc { + struct fcnvme_ls_acc_hdr hdr; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_conn_id connectid; +}; + + +/* FCNVME_LS_CREATE_CONNECTION */ +struct fcnvme_ls_cr_conn_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_cr_conn_cmd connect_cmd; +}; + +struct fcnvme_ls_cr_conn_acc { + struct fcnvme_ls_acc_hdr hdr; + struct fcnvme_lsdesc_conn_id connectid; +}; + +/* FCNVME_LS_DISCONNECT */ +struct fcnvme_ls_disconnect_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_disconn_cmd discon_cmd; +}; + +struct fcnvme_ls_disconnect_acc { + struct fcnvme_ls_acc_hdr hdr; +}; + + +/* + * Yet to be defined in FC-NVME: + */ +#define NVME_FC_CONNECT_TIMEOUT_SEC 2 /* 2 seconds */ +#define NVME_FC_LS_TIMEOUT_SEC 2 /* 2 seconds */ +#define NVME_FC_TGTOP_TIMEOUT_SEC 2 /* 2 seconds */ + + +#endif /* _NVME_FC_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fc3c24206593..3d1c6f1b15c9 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -242,6 +242,7 @@ enum { NVME_CTRL_ONCS_COMPARE = 1 << 0, NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, }; @@ -558,6 +559,23 @@ struct nvme_dsm_range { __le64 slba; }; +struct nvme_write_zeroes_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -857,6 +875,7 @@ struct nvme_command { struct nvme_download_firmware dlfw; struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; + struct nvme_write_zeroes_cmd write_zeroes; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; struct nvmf_common_command fabrics; @@ -947,6 +966,7 @@ enum { NVME_SC_BAD_ATTRIBUTES = 0x180, NVME_SC_INVALID_PI = 0x181, NVME_SC_READ_ONLY = 0x182, + NVME_SC_ONCS_NOT_SUPPORTED = 0x183, /* * I/O Command Set Specific - Fabrics commands: @@ -973,17 +993,30 @@ enum { NVME_SC_UNWRITTEN_BLOCK = 0x287, NVME_SC_DNR = 0x4000, + + + /* + * FC Transport-specific error status values for NVME commands + * + * Transport-specific status code values must be in the range 0xB0..0xBF + */ + + /* Generic FC failure - catchall */ + NVME_SC_FC_TRANSPORT_ERROR = 0x00B0, + + /* I/O failure due to FC ABTS'd */ + NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1, }; struct nvme_completion { /* * Used by Admin and Fabrics commands to return data: */ - union { - __le16 result16; - __le32 result; - __le64 result64; - }; + union nvme_result { + __le16 u16; + __le32 u32; + __le64 u64; + } result; __le16 sq_head; /* how much of this queue may be reclaimed */ __le16 sq_id; /* submission queue that generated this entry */ __u16 command_id; /* of the command which completed */ diff --git a/include/linux/parser.h b/include/linux/parser.h index 39d5b7955b23..884c1e6eb3fe 100644 --- a/include/linux/parser.h +++ b/include/linux/parser.h @@ -27,6 +27,7 @@ typedef struct { int match_token(char *, const match_table_t table, substring_t args[]); int match_int(substring_t *, int *result); +int match_u64(substring_t *, u64 *result); int match_octal(substring_t *, int *result); int match_hex(substring_t *, int *result); bool match_wildcard(const char *pattern, const char *str); diff --git a/include/linux/swap.h b/include/linux/swap.h index 09b212d37f1d..09f4be179ff3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -319,6 +319,9 @@ extern int kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP + +#include <linux/blk_types.h> /* for bio_end_io_t */ + /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); diff --git a/include/linux/uio.h b/include/linux/uio.h index 6e22b544d039..d5aba1512b8b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -125,7 +125,7 @@ static inline bool iter_is_iovec(const struct iov_iter *i) * * The ?: is just for type safety. */ -#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & RW_MASK) +#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & (READ | WRITE)) /* * Cap the iov_iter by given limit; note that the second argument is diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 797100e10010..c78f9f0920b5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -9,6 +9,9 @@ #include <linux/fs.h> #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> +#include <linux/blk_types.h> + +struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -100,6 +103,16 @@ struct writeback_control { #endif }; +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return REQ_SYNC; + else if (wbc->for_kupdate || wbc->for_background) + return REQ_BACKGROUND; + + return 0; +} + /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 8a9563144890..8990e580b278 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -414,14 +414,14 @@ extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, extern int scsi_execute_req_flags(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, - int retries, int *resid, u64 flags); + int retries, int *resid, u64 flags, req_flags_t rq_flags); static inline int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, int retries, int *resid) { return scsi_execute_req_flags(sdev, cmd, data_direction, buffer, - bufflen, sshdr, timeout, retries, resid, 0); + bufflen, sshdr, timeout, retries, resid, 0, 0); } extern void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h index d1defd1ebd95..6ba66e01f6df 100644 --- a/include/scsi/scsi_proto.h +++ b/include/scsi/scsi_proto.h @@ -299,4 +299,21 @@ struct scsi_lun { #define SCSI_ACCESS_STATE_MASK 0x0f #define SCSI_ACCESS_STATE_PREFERRED 0x80 +/* Reporting options for REPORT ZONES */ +enum zbc_zone_reporting_options { + ZBC_ZONE_REPORTING_OPTION_ALL = 0, + ZBC_ZONE_REPORTING_OPTION_EMPTY, + ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN, + ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN, + ZBC_ZONE_REPORTING_OPTION_CLOSED, + ZBC_ZONE_REPORTING_OPTION_FULL, + ZBC_ZONE_REPORTING_OPTION_READONLY, + ZBC_ZONE_REPORTING_OPTION_OFFLINE, + ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10, + ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE, + ZBC_ZONE_REPORTING_OPTION_NON_WP = 0x3f, +}; + +#define ZBC_REPORT_ZONE_PARTIAL 0x80 + #endif /* _SCSI_PROTO_H_ */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index d336b890e31f..df3e9ae5ad8d 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -27,8 +27,7 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -102,8 +101,7 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u", @@ -138,8 +136,7 @@ TRACE_EVENT(bcache_read, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -170,8 +167,7 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->writeback = writeback; __entry->bypass = bypass; ), diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8f3a163b8166..3e02e3a25413 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -84,8 +84,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error, 0 : blk_rq_sectors(rq); __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); ), @@ -163,7 +162,7 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector = nr_bytes >> 9; __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, nr_bytes); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); blk_dump_cmd(__get_str(cmd), rq); ), @@ -199,8 +198,7 @@ DECLARE_EVENT_CLASS(block_rq, __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? blk_rq_bytes(rq) : 0; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -274,8 +272,7 @@ TRACE_EVENT(block_bio_bounce, bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -313,8 +310,7 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -341,8 +337,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -409,8 +404,7 @@ TRACE_EVENT(block_bio_queue, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -438,7 +432,7 @@ DECLARE_EVENT_CLASS(block_get_rq, __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0, + blk_fill_rwbs(__entry->rwbs, bio ? bio->bi_opf : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -573,8 +567,7 @@ TRACE_EVENT(block_split, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -617,8 +610,7 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", @@ -664,8 +656,7 @@ TRACE_EVENT(block_rq_remap, __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 903a09165bb1..5da2c829a718 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -32,7 +32,7 @@ TRACE_DEFINE_ENUM(LFS); TRACE_DEFINE_ENUM(SSR); TRACE_DEFINE_ENUM(__REQ_RAHEAD); TRACE_DEFINE_ENUM(__REQ_SYNC); -TRACE_DEFINE_ENUM(__REQ_NOIDLE); +TRACE_DEFINE_ENUM(__REQ_IDLE); TRACE_DEFINE_ENUM(__REQ_PREFLUSH); TRACE_DEFINE_ENUM(__REQ_FUA); TRACE_DEFINE_ENUM(__REQ_PRIO); @@ -55,7 +55,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA)) +#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | REQ_PREFLUSH | REQ_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) #define show_bio_type(op_flags) show_bio_op_flags(op_flags), \ @@ -65,11 +65,9 @@ TRACE_DEFINE_ENUM(CP_DISCARD); __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ { 0, "WRITE" }, \ { REQ_RAHEAD, "READAHEAD" }, \ - { READ_SYNC, "READ_SYNC" }, \ - { WRITE_SYNC, "WRITE_SYNC" }, \ - { WRITE_FLUSH, "WRITE_FLUSH" }, \ - { WRITE_FUA, "WRITE_FUA" }, \ - { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" }) + { REQ_SYNC, "REQ_SYNC" }, \ + { REQ_PREFLUSH, "REQ_PREFLUSH" }, \ + { REQ_FUA, "REQ_FUA" }) #define show_bio_extra(type) \ __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \ diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h new file mode 100644 index 000000000000..3c518e455680 --- /dev/null +++ b/include/trace/events/wbt.h @@ -0,0 +1,153 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM wbt + +#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WBT_H + +#include <linux/tracepoint.h> +#include "../../../block/blk-wbt.h" + +/** + * wbt_stat - trace stats for blk_wb + * @stat: array of read/write stats + */ +TRACE_EVENT(wbt_stat, + + TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat), + + TP_ARGS(bdi, stat), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(s64, rmean) + __field(u64, rmin) + __field(u64, rmax) + __field(s64, rnr_samples) + __field(s64, rtime) + __field(s64, wmean) + __field(u64, wmin) + __field(u64, wmax) + __field(s64, wnr_samples) + __field(s64, wtime) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->rmean = stat[0].mean; + __entry->rmin = stat[0].min; + __entry->rmax = stat[0].max; + __entry->rnr_samples = stat[0].nr_samples; + __entry->wmean = stat[1].mean; + __entry->wmin = stat[1].min; + __entry->wmax = stat[1].max; + __entry->wnr_samples = stat[1].nr_samples; + ), + + TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, " + "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n", + __entry->name, __entry->rmean, __entry->rmin, __entry->rmax, + __entry->rnr_samples, __entry->wmean, __entry->wmin, + __entry->wmax, __entry->wnr_samples) +); + +/** + * wbt_lat - trace latency event + * @lat: latency trigger + */ +TRACE_EVENT(wbt_lat, + + TP_PROTO(struct backing_dev_info *bdi, unsigned long lat), + + TP_ARGS(bdi, lat), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, lat) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->lat = div_u64(lat, 1000); + ), + + TP_printk("%s: latency %lluus\n", __entry->name, + (unsigned long long) __entry->lat) +); + +/** + * wbt_step - trace wb event step + * @msg: context message + * @step: the current scale step count + * @window: the current monitoring window + * @bg: the current background queue limit + * @normal: the current normal writeback limit + * @max: the current max throughput writeback limit + */ +TRACE_EVENT(wbt_step, + + TP_PROTO(struct backing_dev_info *bdi, const char *msg, + int step, unsigned long window, unsigned int bg, + unsigned int normal, unsigned int max), + + TP_ARGS(bdi, msg, step, window, bg, normal, max), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(const char *, msg) + __field(int, step) + __field(unsigned long, window) + __field(unsigned int, bg) + __field(unsigned int, normal) + __field(unsigned int, max) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->msg = msg; + __entry->step = step; + __entry->window = div_u64(window, 1000); + __entry->bg = bg; + __entry->normal = normal; + __entry->max = max; + ), + + TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n", + __entry->name, __entry->msg, __entry->step, __entry->window, + __entry->bg, __entry->normal, __entry->max) +); + +/** + * wbt_timer - trace wb timer event + * @status: timer state status + * @step: the current scale step count + * @inflight: tracked writes inflight + */ +TRACE_EVENT(wbt_timer, + + TP_PROTO(struct backing_dev_info *bdi, unsigned int status, + int step, unsigned int inflight), + + TP_ARGS(bdi, status, step, inflight), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned int, status) + __field(int, step) + __field(unsigned int, inflight) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->status = status; + __entry->step = step; + __entry->inflight = inflight; + ), + + TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name, + __entry->status, __entry->step, __entry->inflight) +); + +#endif /* _TRACE_WBT_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 9838a5c7b3e7..bc2ef9fef7c8 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -70,6 +70,7 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h +header-y += blkzoned.h header-y += bpf_common.h header-y += bpf_perf_event.h header-y += bpf.h diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h new file mode 100644 index 000000000000..40d1d7bff537 --- /dev/null +++ b/include/uapi/linux/blkzoned.h @@ -0,0 +1,143 @@ +/* + * Zoned block devices handling. + * + * Copyright (C) 2015 Seagate Technology PLC + * + * Written by: Shaun Tancheff <shaun.tancheff@seagate.com> + * + * Modified by: Damien Le Moal <damien.lemoal@hgst.com> + * Copyright (C) 2016 Western Digital + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ +#ifndef _UAPI_BLKZONED_H +#define _UAPI_BLKZONED_H + +#include <linux/types.h> +#include <linux/ioctl.h> + +/** + * enum blk_zone_type - Types of zones allowed in a zoned device. + * + * @BLK_ZONE_TYPE_CONVENTIONAL: The zone has no write pointer and can be writen + * randomly. Zone reset has no effect on the zone. + * @BLK_ZONE_TYPE_SEQWRITE_REQ: The zone must be written sequentially + * @BLK_ZONE_TYPE_SEQWRITE_PREF: The zone can be written non-sequentially + * + * Any other value not defined is reserved and must be considered as invalid. + */ +enum blk_zone_type { + BLK_ZONE_TYPE_CONVENTIONAL = 0x1, + BLK_ZONE_TYPE_SEQWRITE_REQ = 0x2, + BLK_ZONE_TYPE_SEQWRITE_PREF = 0x3, +}; + +/** + * enum blk_zone_cond - Condition [state] of a zone in a zoned device. + * + * @BLK_ZONE_COND_NOT_WP: The zone has no write pointer, it is conventional. + * @BLK_ZONE_COND_EMPTY: The zone is empty. + * @BLK_ZONE_COND_IMP_OPEN: The zone is open, but not explicitly opened. + * @BLK_ZONE_COND_EXP_OPEN: The zones was explicitly opened by an + * OPEN ZONE command. + * @BLK_ZONE_COND_CLOSED: The zone was [explicitly] closed after writing. + * @BLK_ZONE_COND_FULL: The zone is marked as full, possibly by a zone + * FINISH ZONE command. + * @BLK_ZONE_COND_READONLY: The zone is read-only. + * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written). + * + * The Zone Condition state machine in the ZBC/ZAC standards maps the above + * deinitions as: + * - ZC1: Empty | BLK_ZONE_EMPTY + * - ZC2: Implicit Open | BLK_ZONE_COND_IMP_OPEN + * - ZC3: Explicit Open | BLK_ZONE_COND_EXP_OPEN + * - ZC4: Closed | BLK_ZONE_CLOSED + * - ZC5: Full | BLK_ZONE_FULL + * - ZC6: Read Only | BLK_ZONE_READONLY + * - ZC7: Offline | BLK_ZONE_OFFLINE + * + * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should + * be considered invalid. + */ +enum blk_zone_cond { + BLK_ZONE_COND_NOT_WP = 0x0, + BLK_ZONE_COND_EMPTY = 0x1, + BLK_ZONE_COND_IMP_OPEN = 0x2, + BLK_ZONE_COND_EXP_OPEN = 0x3, + BLK_ZONE_COND_CLOSED = 0x4, + BLK_ZONE_COND_READONLY = 0xD, + BLK_ZONE_COND_FULL = 0xE, + BLK_ZONE_COND_OFFLINE = 0xF, +}; + +/** + * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. + * + * @start: Zone start in 512 B sector units + * @len: Zone length in 512 B sector units + * @wp: Zone write pointer location in 512 B sector units + * @type: see enum blk_zone_type for possible values + * @cond: see enum blk_zone_cond for possible values + * @non_seq: Flag indicating that the zone is using non-sequential resources + * (for host-aware zoned block devices only). + * @reset: Flag indicating that a zone reset is recommended. + * @reserved: Padding to 64 B to match the ZBC/ZAC defined zone descriptor size. + * + * start, len and wp use the regular 512 B sector unit, regardless of the + * device logical block size. The overall structure size is 64 B to match the + * ZBC/ZAC defined zone descriptor and allow support for future additional + * zone information. + */ +struct blk_zone { + __u64 start; /* Zone start sector */ + __u64 len; /* Zone length in number of sectors */ + __u64 wp; /* Zone write pointer position */ + __u8 type; /* Zone type */ + __u8 cond; /* Zone condition */ + __u8 non_seq; /* Non-sequential write resources active */ + __u8 reset; /* Reset write pointer recommended */ + __u8 reserved[36]; +}; + +/** + * struct blk_zone_report - BLKREPORTZONE ioctl request/reply + * + * @sector: starting sector of report + * @nr_zones: IN maximum / OUT actual + * @reserved: padding to 16 byte alignment + * @zones: Space to hold @nr_zones @zones entries on reply. + * + * The array of at most @nr_zones must follow this structure in memory. + */ +struct blk_zone_report { + __u64 sector; + __u32 nr_zones; + __u8 reserved[4]; + struct blk_zone zones[0]; +} __packed; + +/** + * struct blk_zone_range - BLKRESETZONE ioctl request + * @sector: starting sector of the first zone to issue reset write pointer + * @nr_sectors: Total number of sectors of 1 or more zones to reset + */ +struct blk_zone_range { + __u64 sector; + __u64 nr_sectors; +}; + +/** + * Zoned block device ioctl's: + * + * @BLKREPORTZONE: Get zone information. Takes a zone report as argument. + * The zone report will start from the zone containing the + * sector specified in the report request structure. + * @BLKRESETZONE: Reset the write pointer of the zones in the specified + * sector range. The sector range must be zone aligned. + */ +#define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) +#define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) + +#endif /* _UAPI_BLKZONED_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index acb2b6152ba0..c1d11df07b28 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -225,6 +225,10 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +/* + * A jump here: 130-131 are reserved for zoned block devices + * (see uapi/linux/blkzoned.h) + */ #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index e08e413d5f71..c91c642ea900 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -38,11 +38,12 @@ enum { }; /* values for flags field */ -#define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ -#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ -#define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ +#define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ +#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ +#define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ /* there is a gap here to match userspace */ -#define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +#define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ +#define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ /* userspace doesn't need the nbd_device structure */ diff --git a/include/uapi/scsi/fc/fc_fs.h b/include/uapi/scsi/fc/fc_fs.h index 50f28b143451..dcf314dc2a27 100644 --- a/include/uapi/scsi/fc/fc_fs.h +++ b/include/uapi/scsi/fc/fc_fs.h @@ -190,6 +190,7 @@ enum fc_fh_type { FC_TYPE_FCP = 0x08, /* SCSI FCP */ FC_TYPE_CT = 0x20, /* Fibre Channel Services (FC-CT) */ FC_TYPE_ILS = 0x22, /* internal link service */ + FC_TYPE_NVME = 0x28, /* FC-NVME */ }; /* @@ -203,6 +204,7 @@ enum fc_fh_type { [FC_TYPE_FCP] = "FCP", \ [FC_TYPE_CT] = "CT", \ [FC_TYPE_ILS] = "ILS", \ + [FC_TYPE_NVME] = "NVME", \ } /* |