1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/fs.h>
26#include <linux/mm.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/pagemap.h>
30#include <linux/task_io_accounting_ops.h>
31#include <linux/bio.h>
32#include <linux/wait.h>
33#include <linux/err.h>
34#include <linux/blkdev.h>
35#include <linux/buffer_head.h>
36#include <linux/rwsem.h>
37#include <linux/uio.h>
38#include <linux/atomic.h>
39#include <linux/prefetch.h>
40
41
42
43
44
/*
 * Number of user pages pinned per batch by dio_refill_pages(); also sizes
 * the dio->pages[] array embedded in struct dio.
 */
#define DIO_PAGES 64
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
 * Per-call submission state for one direct-IO request.  Lives on the
 * submitter's stack, so it needs no locking; everything that must survive
 * until IO completion lives in struct dio instead.
 */
struct dio_submit {
	struct bio *bio;		/* bio under assembly, not yet submitted */
	unsigned blkbits;		/* IO block size, log2 */
	unsigned blkfactor;		/* fs blocksize / IO blocksize, log2;
					 * 0 when they are equal */
	unsigned start_zero_done;	/* set once the leading partial fs-block
					 * has been considered for zeroing */
	int pages_in_io;		/* approximate total page count for this IO */
	sector_t block_in_file;		/* current block (in blkbits units)
					 * being worked on */
	unsigned blocks_available;	/* blocks still mapped at next_block_for_io */
	int reap_counter;		/* rate-limits bio reaping in dio_bio_reap() */
	sector_t final_block_in_request;/* one past the last block to transfer */
	int boundary;			/* last mapped block had buffer_boundary() set */
	get_block_t *get_block;		/* filesystem block-mapping callback */
	dio_submit_t *submit_io;	/* optional override for submit_bio() */

	loff_t logical_offset_in_bio;	/* file offset of first byte in current bio */
	sector_t final_block_in_bio;	/* one past the last block in current bio */
	sector_t next_block_for_io;	/* next device block to add to a bio */

	/*
	 * Deferred "current page": a contiguous run being accumulated so
	 * adjacent chunks can be merged before being added to a bio.
	 */
	struct page *cur_page;		/* page being held (referenced) */
	unsigned cur_page_offset;	/* byte offset of run within cur_page */
	unsigned cur_page_len;		/* length of run, bytes */
	sector_t cur_page_block;	/* first device block of the run */
	loff_t cur_page_fs_offset;	/* file offset of the run */

	struct iov_iter *iter;		/* source/destination user memory */

	/* Window into dio->pages[] filled by dio_refill_pages(). */
	unsigned head;			/* next page to consume */
	unsigned tail;			/* one past last valid page */
	size_t from, to;		/* byte range within first/last page */
};
107
108
/*
 * State that must outlive submission: shared between the submitter and the
 * bio completion path, hence heap-allocated from dio_cache and refcounted.
 */
struct dio {
	int flags;			/* DIO_* flags passed by the caller */
	int op;				/* REQ_OP_READ or REQ_OP_WRITE */
	int op_flags;			/* request flags (e.g. WRITE_ODIRECT) */
	blk_qc_t bio_cookie;		/* cookie of last submitted bio, for blk_poll() */
	struct block_device *bio_bdev;	/* bdev of last submitted bio */
	struct inode *inode;		/* inode being read/written */
	loff_t i_size;			/* i_size snapshot taken at setup time */
	dio_iodone_t *end_io;		/* filesystem IO-completion callback */

	void *private;			/* copy of map_bh.b_private for end_io */

	/* bio_lock protects the fields below against the IRQ completion path */
	spinlock_t bio_lock;
	int page_errors;		/* first error from iov_iter_get_pages() */
	int is_async;			/* completion runs from bio end_io, not caller */
	bool defer_completion;		/* run completion in process context (workqueue) */
	bool should_dirty;		/* user pages must be (re)dirtied after read */
	int io_error;			/* first IO error seen */
	unsigned long refcount;		/* 1 for the submitter + 1 per in-flight bio */
	struct bio *bio_list;		/* singly linked via bi_private (sync case) */
	struct task_struct *waiter;	/* task sleeping in dio_await_one(), if any */

	struct kiocb *iocb;		/* the originating iocb */
	ssize_t result;			/* bytes set up for transfer so far */

	/*
	 * pages[] is only used during submission; complete_work only after
	 * the last bio finishes.  Their lifetimes are disjoint, so overlap
	 * them to keep the structure within one or two cachelines of pages[].
	 */
	union {
		struct page *pages[DIO_PAGES];	/* page pipe from iov_iter */
		struct work_struct complete_work;/* deferred completion work */
	};
} ____cacheline_aligned_in_smp;
146
/* Slab cache for struct dio; created once at module init (dio_init). */
static struct kmem_cache *dio_cache __read_mostly;
148
149
150
151
152static inline unsigned dio_pages_present(struct dio_submit *sdio)
153{
154 return sdio->tail - sdio->head;
155}
156
157
158
159
/*
 * Refill the page pipe in dio->pages[] from the iov_iter.  Returns 0 on
 * success (head/tail/from/to describe the new window) or a -ve errno.
 */
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
	ssize_t ret;

	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
				&sdio->from);

	if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) {
		struct page *page = ZERO_PAGE(0);
		/*
		 * A memory fault while we have mapped blocks for a WRITE:
		 * we cannot simply abort, because blocks may already have
		 * been allocated/instantiated by get_block.  Substitute the
		 * shared zero page so those blocks get zeroed out on disk,
		 * and remember the fault so dio_complete() can report it.
		 */
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		get_page(page);		/* balanced by the consumer's put_page() */
		dio->pages[0] = page;
		sdio->head = 0;
		sdio->tail = 1;
		sdio->from = 0;
		sdio->to = PAGE_SIZE;
		return 0;
	}

	if (ret >= 0) {
		/* ret is a byte count; convert to a page window. */
		iov_iter_advance(sdio->iter, ret);
		ret += sdio->from;
		sdio->head = 0;
		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
		return 0;
	}
	return ret;
}
195
196
197
198
199
200
201
202static inline struct page *dio_get_page(struct dio *dio,
203 struct dio_submit *sdio)
204{
205 if (dio_pages_present(sdio) == 0) {
206 int ret;
207
208 ret = dio_refill_pages(dio, sdio);
209 if (ret)
210 return ERR_PTR(ret);
211 BUG_ON(dio_pages_present(sdio) == 0);
212 }
213 return dio->pages[sdio->head];
214}
215
216
217
218
219
220
221
222
223
224
225
226
227
/*
 * Final stage of a direct-IO request: compute the byte count / error to
 * report, invoke the filesystem's end_io callback, drop the inode DIO
 * count, complete the iocb for async requests, and free the dio.
 *
 * Called either by the submitter (sync case) or from bio completion /
 * workqueue context (async case, is_async == true).  Frees @dio; the
 * caller must not touch it afterwards.
 */
static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
{
	loff_t offset = dio->iocb->ki_pos;
	ssize_t transferred = 0;

	/*
	 * -EIOCBQUEUED from the submitter just means "IO is in flight";
	 * it is not an error for accounting purposes.
	 */
	if (ret == -EIOCBQUEUED)
		ret = 0;

	if (dio->result) {
		transferred = dio->result;

		/* A read must never report bytes past the snapshotted EOF. */
		if ((dio->op == REQ_OP_READ) &&
		    ((offset + transferred) > dio->i_size))
			transferred = dio->i_size - offset;

		/* A partial transfer hit a fault: report the short count. */
		if (unlikely(ret == -EFAULT) && transferred)
			ret = 0;
	}

	/* Error precedence: page-pinning error, then IO error, then count. */
	if (ret == 0)
		ret = dio->page_errors;
	if (ret == 0)
		ret = dio->io_error;
	if (ret == 0)
		ret = transferred;

	if (dio->end_io) {
		int err;

		/* Filesystem completion hook; it may override the result. */
		err = dio->end_io(dio->iocb, offset, ret, dio->private);
		if (err)
			ret = err;
	}

	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(dio->inode);

	if (is_async) {
		/*
		 * Async path owns ki_pos advancement and (for O_DSYNC-style
		 * writes) the sync, then signals completion to the caller.
		 */
		dio->iocb->ki_pos += transferred;

		if (dio->op == REQ_OP_WRITE)
			ret = generic_write_sync(dio->iocb, transferred);
		dio->iocb->ki_complete(dio->iocb, ret, 0);
	}

	kmem_cache_free(dio_cache, dio);
	return ret;
}
289
290static void dio_aio_complete_work(struct work_struct *work)
291{
292 struct dio *dio = container_of(work, struct dio, complete_work);
293
294 dio_complete(dio, 0, true);
295}
296
/* Forward declaration: needed by the bio end_io handlers defined below. */
static int dio_bio_complete(struct dio *dio, struct bio *bio);
298
299
300
301
/*
 * bio completion handler for async IO.  May run in interrupt context.
 * Completes the bio immediately; if this was the last outstanding
 * reference, finishes the whole dio (inline or via workqueue).
 */
static void dio_bio_end_aio(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long remaining;
	unsigned long flags;

	/* Release pages and the bio itself before dropping our ref. */
	dio_bio_complete(dio, bio);

	spin_lock_irqsave(&dio->bio_lock, flags);
	remaining = --dio->refcount;
	/* refcount == 1 means only the submitter's ref is left: wake it. */
	if (remaining == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (remaining == 0) {
		/*
		 * Last reference gone: run final completion.  Defer to a
		 * workqueue when the filesystem asked for process context
		 * (e.g. to do unwritten-extent conversion or fsync).
		 */
		if (dio->result && dio->defer_completion) {
			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
			queue_work(dio->inode->i_sb->s_dio_done_wq,
				   &dio->complete_work);
		} else {
			dio_complete(dio, 0, true);
		}
	}
}
327
328
329
330
331
332
333
334
/*
 * bio completion handler for synchronous IO.  May run in interrupt
 * context, so it does no real work: it just chains the bio onto
 * dio->bio_list (reusing bi_private as the link) and wakes the waiter,
 * which will reap it via dio_await_one()/dio_bio_complete().
 */
static void dio_bio_end_io(struct bio *bio)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	/* refcount == 1: only the submitter remains; wake it if sleeping. */
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);
}
347
348
349
350
351
352
353
354
355
356
357void dio_end_io(struct bio *bio, int error)
358{
359 struct dio *dio = bio->bi_private;
360
361 if (dio->is_async)
362 dio_bio_end_aio(bio);
363 else
364 dio_bio_end_io(bio);
365}
366EXPORT_SYMBOL_GPL(dio_end_io);
367
368static inline void
369dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
370 struct block_device *bdev,
371 sector_t first_sector, int nr_vecs)
372{
373 struct bio *bio;
374
375
376
377
378
379 bio = bio_alloc(GFP_KERNEL, nr_vecs);
380
381 bio->bi_bdev = bdev;
382 bio->bi_iter.bi_sector = first_sector;
383 bio_set_op_attrs(bio, dio->op, dio->op_flags);
384 if (dio->is_async)
385 bio->bi_end_io = dio_bio_end_aio;
386 else
387 bio->bi_end_io = dio_bio_end_io;
388
389 sdio->bio = bio;
390 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
391}
392
393
394
395
396
397
398
399
/*
 * Submit the bio under assembly (sdio->bio) to the block layer (or to the
 * caller-supplied submit_io hook) and reset the per-bio submission state.
 *
 * The dio refcount is bumped BEFORE submission so the completion handler
 * cannot drop the last reference while we are still here.
 */
static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
{
	struct bio *bio = sdio->bio;
	unsigned long flags;

	bio->bi_private = dio;

	spin_lock_irqsave(&dio->bio_lock, flags);
	dio->refcount++;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	/* Async reads into user pages: mark them dirty before the device
	 * writes into them, so reclaim cannot lose the data. */
	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty)
		bio_set_pages_dirty(bio);

	dio->bio_bdev = bio->bi_bdev;

	if (sdio->submit_io) {
		sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
		dio->bio_cookie = BLK_QC_T_NONE;	/* no polling possible */
	} else
		dio->bio_cookie = submit_bio(bio);

	sdio->bio = NULL;
	sdio->boundary = 0;
	sdio->logical_offset_in_bio = 0;
}
426
427
428
429
430static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
431{
432 while (sdio->head < sdio->tail)
433 put_page(dio->pages[sdio->head++]);
434}
435
436
437
438
439
440
441
/*
 * Wait for, and dequeue, one completed bio from dio->bio_list (sync IO
 * reaping path).  Returns NULL once all bios have been reaped, i.e. when
 * refcount has dropped to the submitter's single reference and the list
 * is empty.
 *
 * The sleep protocol is deliberately exact: we set TASK_UNINTERRUPTIBLE
 * and publish dio->waiter while still holding bio_lock, so a completion
 * that runs after we drop the lock is guaranteed to see the waiter and
 * wake us — no lost wakeups.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio = NULL;

	spin_lock_irqsave(&dio->bio_lock, flags);

	while (dio->refcount > 1 && dio->bio_list == NULL) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		dio->waiter = current;
		spin_unlock_irqrestore(&dio->bio_lock, flags);
		/* Poll the device for HIPRI iocbs; otherwise just sleep.
		 * A successful blk_poll() means completion already ran, so
		 * skipping io_schedule() is fine. */
		if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
		    !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
			io_schedule();

		spin_lock_irqsave(&dio->bio_lock, flags);
		dio->waiter = NULL;
	}
	if (dio->bio_list) {
		bio = dio->bio_list;
		dio->bio_list = bio->bi_private;	/* unlink head */
	}
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return bio;
}
473
474
475
476
/*
 * Process one completed bio: record any IO error, release the pages it
 * referenced (dirtying them first for reads into user memory), and free
 * the bio.  Returns the bio's error status.
 */
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
	struct bio_vec *bvec;
	unsigned i;
	int err;

	if (bio->bi_error)
		dio->io_error = -EIO;

	if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
		err = bio->bi_error;
		/* Pages were pre-dirtied at submit; this re-checks them and
		 * takes ownership of the bio (frees it, possibly deferred). */
		bio_check_pages_dirty(bio);
	} else {
		bio_for_each_segment_all(bvec, bio, i) {
			struct page *page = bvec->bv_page;

			if (dio->op == REQ_OP_READ && !PageCompound(page) &&
					dio->should_dirty)
				set_page_dirty_lock(page);
			put_page(page);
		}
		err = bio->bi_error;
		bio_put(bio);
	}
	return err;
}
503
504
505
506
507
508
509
510
511static void dio_await_completion(struct dio *dio)
512{
513 struct bio *bio;
514 do {
515 bio = dio_await_one(dio);
516 if (bio)
517 dio_bio_complete(dio, bio);
518 } while (bio);
519}
520
521
522
523
524
525
526
527
/*
 * Opportunistically reap already-completed bios during submission so
 * their pages are released promptly.  Rate-limited by reap_counter so we
 * only take the lock every 64 calls.  Returns the first error seen.
 */
static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
{
	int ret = 0;

	if (sdio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;
			int ret2;

			/* Pop one bio off the completion list under the lock. */
			spin_lock_irqsave(&dio->bio_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_lock, flags);
			ret2 = dio_bio_complete(dio, bio);
			/* Keep the first error, but drain the whole list. */
			if (ret == 0)
				ret = ret2;
		}
		sdio->reap_counter = 0;
	}
	return ret;
}
550
551
552
553
554
555
556
/*
 * Lazily create the per-superblock workqueue used for deferred dio
 * completion.  Safe against concurrent callers: the cmpxchg() ensures
 * exactly one workqueue is installed; losers destroy their copy.
 * Returns 0 on success (including "someone else won"), -ENOMEM otherwise.
 */
static int sb_init_dio_done_wq(struct super_block *sb)
{
	struct workqueue_struct *old;
	/* WQ_MEM_RECLAIM: completion work may be needed to make progress
	 * under memory pressure. */
	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
						      WQ_MEM_RECLAIM, 0,
						      sb->s_id);
	if (!wq)
		return -ENOMEM;

	/* Atomically publish; only the first caller's wq survives. */
	old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);

	if (old)
		destroy_workqueue(wq);	/* lost the race */
	return 0;
}
574
575static int dio_set_defer_completion(struct dio *dio)
576{
577 struct super_block *sb = dio->inode->i_sb;
578
579 if (dio->defer_completion)
580 return 0;
581 dio->defer_completion = true;
582 if (!sb->s_dio_done_wq)
583 return sb_init_dio_done_wq(sb);
584 return 0;
585}
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
/*
 * Ask the filesystem (via sdio->get_block) to map the next run of blocks
 * starting at sdio->block_in_file.  On success map_bh describes the
 * mapping: b_blocknr/b_size if mapped, or unmapped for a hole.
 *
 * Returns 0 on success, or a -ve errno (including any fault recorded
 * earlier by dio_refill_pages(), which is surfaced here so block
 * allocation stops at the fault point).
 */
static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
			   struct buffer_head *map_bh)
{
	int ret;
	sector_t fs_startblk;	/* into file, in filesystem-sized blocks */
	sector_t fs_endblk;	/* into file, in filesystem-sized blocks */
	unsigned long fs_count;	/* number of filesystem-sized blocks */
	int create;
	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;

	/*
	 * If a page fault already occurred, do not map (and possibly
	 * allocate) any further blocks: propagate the fault instead.
	 */
	ret = dio->page_errors;
	if (ret == 0) {
		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
		/* Convert the remaining dio-block range to fs blocks. */
		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
		fs_endblk = (sdio->final_block_in_request - 1) >>
					sdio->blkfactor;
		fs_count = fs_endblk - fs_startblk + 1;

		map_bh->b_state = 0;
		map_bh->b_size = fs_count << i_blkbits;

		/*
		 * For writes we normally ask get_block to allocate
		 * (create = 1).  With DIO_SKIP_HOLES, writes inside the
		 * current i_size must not instantiate holes — the caller
		 * will fall back to buffered IO there instead.
		 */
		create = dio->op == REQ_OP_WRITE;
		if (dio->flags & DIO_SKIP_HOLES) {
			if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
							i_blkbits))
				create = 0;
		}

		ret = (*sdio->get_block)(dio->inode, fs_startblk,
						map_bh, create);

		/* Preserve b_private for the filesystem's end_io callback. */
		dio->private = map_bh->b_private;

		if (ret == 0 && buffer_defer_completion(map_bh))
			ret = dio_set_defer_completion(dio);
	}
	return ret;
}
664
665
666
667
668static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
669 sector_t start_sector, struct buffer_head *map_bh)
670{
671 sector_t sector;
672 int ret, nr_pages;
673
674 ret = dio_bio_reap(dio, sdio);
675 if (ret)
676 goto out;
677 sector = start_sector << (sdio->blkbits - 9);
678 nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
679 BUG_ON(nr_pages <= 0);
680 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
681 sdio->boundary = 0;
682out:
683 return ret;
684}
685
686
687
688
689
690
691
692
693static inline int dio_bio_add_page(struct dio_submit *sdio)
694{
695 int ret;
696
697 ret = bio_add_page(sdio->bio, sdio->cur_page,
698 sdio->cur_page_len, sdio->cur_page_offset);
699 if (ret == sdio->cur_page_len) {
700
701
702
703 if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
704 sdio->pages_in_io--;
705 get_page(sdio->cur_page);
706 sdio->final_block_in_bio = sdio->cur_page_block +
707 (sdio->cur_page_len >> sdio->blkbits);
708 ret = 0;
709 } else {
710 ret = 1;
711 }
712 return ret;
713}
714
715
716
717
718
719
720
721
722
723
724
/*
 * Push the deferred "current page" run into a bio.  If the run is not
 * physically and logically contiguous with the bio under assembly, that
 * bio is submitted first and a new one started.  Returns 0 or -ve errno.
 */
static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
		struct buffer_head *map_bh)
{
	int ret = 0;

	if (sdio->bio) {
		loff_t cur_offset = sdio->cur_page_fs_offset;
		loff_t bio_next_offset = sdio->logical_offset_in_bio +
			sdio->bio->bi_iter.bi_size;

		/*
		 * The run can only be appended if it continues the bio both
		 * on disk (device block) and in the file (logical offset).
		 * Checking both matters: with blkfactor != 0, zero-fill runs
		 * can be device-contiguous yet logically discontiguous, and
		 * must not be merged.
		 */
		if (sdio->final_block_in_bio != sdio->cur_page_block ||
		    cur_offset != bio_next_offset)
			dio_bio_submit(dio, sdio);
	}

	if (sdio->bio == NULL) {
		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
		if (ret)
			goto out;
	}

	if (dio_bio_add_page(sdio) != 0) {
		/* bio was full: submit it and retry on a fresh bio, which
		 * must succeed since the run fits in one page. */
		dio_bio_submit(dio, sdio);
		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
		if (ret == 0) {
			ret = dio_bio_add_page(sdio);
			BUG_ON(ret != 0);
		}
	}
out:
	return ret;
}
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789static inline int
790submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
791 unsigned offset, unsigned len, sector_t blocknr,
792 struct buffer_head *map_bh)
793{
794 int ret = 0;
795
796 if (dio->op == REQ_OP_WRITE) {
797
798
799
800 task_io_account_write(len);
801 }
802
803
804
805
806 if (sdio->cur_page == page &&
807 sdio->cur_page_offset + sdio->cur_page_len == offset &&
808 sdio->cur_page_block +
809 (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
810 sdio->cur_page_len += len;
811 goto out;
812 }
813
814
815
816
817 if (sdio->cur_page) {
818 ret = dio_send_cur_page(dio, sdio, map_bh);
819 put_page(sdio->cur_page);
820 sdio->cur_page = NULL;
821 if (ret)
822 return ret;
823 }
824
825 get_page(page);
826 sdio->cur_page = page;
827 sdio->cur_page_offset = offset;
828 sdio->cur_page_len = len;
829 sdio->cur_page_block = blocknr;
830 sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
831out:
832
833
834
835
836 if (sdio->boundary) {
837 ret = dio_send_cur_page(dio, sdio, map_bh);
838 dio_bio_submit(dio, sdio);
839 put_page(sdio->cur_page);
840 sdio->cur_page = NULL;
841 }
842 return ret;
843}
844
845
846
847
848
849
850static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
851{
852 unsigned i;
853 unsigned nblocks;
854
855 nblocks = map_bh->b_size >> dio->inode->i_blkbits;
856
857 for (i = 0; i < nblocks; i++) {
858 unmap_underlying_metadata(map_bh->b_bdev,
859 map_bh->b_blocknr + i);
860 }
861}
862
863
864
865
866
867
868
869
870
871
/*
 * When the IO is misaligned within a newly allocated filesystem block
 * (blkfactor != 0, buffer_new), zero out the part of that fs block which
 * the user's IO does not cover, so stale disk contents are never exposed.
 * @end: 0 = zero the leading portion, 1 = zero the trailing portion.
 */
static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
		int end, struct buffer_head *map_bh)
{
	unsigned dio_blocks_per_fs_block;
	unsigned this_chunk_blocks;	/* in dio blocks */
	unsigned this_chunk_bytes;
	struct page *page;

	sdio->start_zero_done = 1;	/* only attempt the front zero once */
	if (!sdio->blkfactor || !buffer_new(map_bh))
		return;

	dio_blocks_per_fs_block = 1 << sdio->blkfactor;
	this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);

	/* Already fs-block aligned: nothing to zero. */
	if (!this_chunk_blocks)
		return;

	/* For the back end: zero from the IO's end to the fs-block end. */
	if (end)
		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;

	this_chunk_bytes = this_chunk_blocks << sdio->blkbits;

	/* Feed the shared zero page through the normal submission path. */
	page = ZERO_PAGE(0);
	if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
				sdio->next_block_for_io, map_bh))
		return;

	sdio->next_block_for_io += this_chunk_blocks;
}
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
/*
 * Walk all user pages of the request, chop each into block-sized chunks,
 * map them to device blocks via get_more_blocks(), and feed the chunks to
 * submit_page_section().  Handles holes (reads: zero-fill; writes: bail
 * with -ENOTBLK for buffered fallback) and sub-fs-block alignment.
 * Returns 0 or -ve errno; partial progress is recorded in dio->result.
 */
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
			struct buffer_head *map_bh)
{
	const unsigned blkbits = sdio->blkbits;
	int ret = 0;

	while (sdio->block_in_file < sdio->final_block_in_request) {
		struct page *page;
		size_t from, to;

		page = dio_get_page(dio, sdio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		/* Byte range of this page that belongs to the request:
		 * first page may start at sdio->from, last may end at
		 * sdio->to; all middle pages are whole. */
		from = sdio->head ? 0 : sdio->from;
		to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
		sdio->head++;	/* consume the page from the pipe */

		while (from < to) {
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			if (sdio->blocks_available == 0) {
				/*
				 * Need another mapping from the filesystem.
				 */
				unsigned long blkmask;
				unsigned long dio_remainder;

				ret = get_more_blocks(dio, sdio, map_bh);
				if (ret) {
					put_page(page);
					goto out;
				}
				if (!buffer_mapped(map_bh))
					goto do_holes;

				sdio->blocks_available =
						map_bh->b_size >> sdio->blkbits;
				sdio->next_block_for_io =
					map_bh->b_blocknr << sdio->blkfactor;
				/* New blocks: kill stale bdev buffer aliases. */
				if (buffer_new(map_bh))
					clean_blockdev_aliases(dio, map_bh);

				if (!sdio->blkfactor)
					goto do_holes;

				/*
				 * The mapping starts at an fs-block boundary,
				 * but our position may be inside that fs
				 * block; skip the already-passed dio blocks.
				 */
				blkmask = (1 << sdio->blkfactor) - 1;
				dio_remainder = (sdio->block_in_file & blkmask);

				/*
				 * For a freshly allocated (buffer_new) block
				 * we do NOT advance next_block_for_io: the
				 * skipped leading portion must still be
				 * written (zeroed by dio_zero_block()).
				 */
				if (!buffer_new(map_bh))
					sdio->next_block_for_io += dio_remainder;
				sdio->blocks_available -= dio_remainder;
			}
do_holes:
			/* Handle holes */
			if (!buffer_mapped(map_bh)) {
				loff_t i_size_aligned;

				/* A hole on a write: fall back to buffered. */
				if (dio->op == REQ_OP_WRITE) {
					put_page(page);
					return -ENOTBLK;
				}

				/*
				 * Reads in a hole: zero-fill up to the
				 * block-aligned EOF, stop past it.
				 */
				i_size_aligned = ALIGN(i_size_read(dio->inode),
							1 << blkbits);
				if (sdio->block_in_file >=
						i_size_aligned >> blkbits) {
					/* We hit eof */
					put_page(page);
					goto out;
				}
				zero_user(page, from, 1 << blkbits);
				sdio->block_in_file++;
				from += 1 << blkbits;
				dio->result += 1 << blkbits;
				goto next_block;
			}

			/*
			 * If we are misaligned inside a new fs block, zero
			 * out its leading portion first (done at most once).
			 */
			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
				dio_zero_block(dio, sdio, 0, map_bh);

			/*
			 * Chunk size = min(mapped blocks, bytes left in this
			 * page, blocks left in the request).
			 */
			this_chunk_blocks = sdio->blocks_available;
			u = (to - from) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = sdio->final_block_in_request - sdio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			if (this_chunk_blocks == sdio->blocks_available)
				sdio->boundary = buffer_boundary(map_bh);
			ret = submit_page_section(dio, sdio, page,
						  from,
						  this_chunk_bytes,
						  sdio->next_block_for_io,
						  map_bh);
			if (ret) {
				put_page(page);
				goto out;
			}
			sdio->next_block_for_io += this_chunk_blocks;

			sdio->block_in_file += this_chunk_blocks;
			from += this_chunk_bytes;
			dio->result += this_chunk_bytes;
			sdio->blocks_available -= this_chunk_blocks;
next_block:
			BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
			if (sdio->block_in_file == sdio->final_block_in_request)
				break;
		}

		/* Drop the pipe's reference; any bio holds its own ref. */
		put_page(page);
	}
out:
	return ret;
}
1071
1072static inline int drop_refcount(struct dio *dio)
1073{
1074 int ret2;
1075 unsigned long flags;
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088 spin_lock_irqsave(&dio->bio_lock, flags);
1089 ret2 = --dio->refcount;
1090 spin_unlock_irqrestore(&dio->bio_lock, flags);
1091 return ret2;
1092}
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
/*
 * The core direct-IO engine: validate alignment, set up the dio and
 * dio_submit state, run do_direct_IO() under a block plug, then either
 * return -EIOCBQUEUED (async) or wait for completion and return the
 * final byte count / error (sync).
 *
 * Inline so that the constant-flag callers get the checks optimised out;
 * __blockdev_direct_IO() is the out-of-line entry point.
 */
static inline ssize_t
do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
		      struct block_device *bdev, struct iov_iter *iter,
		      get_block_t get_block, dio_iodone_t end_io,
		      dio_submit_t submit_io, int flags)
{
	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
	unsigned blkbits = i_blkbits;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;
	size_t count = iov_iter_count(iter);
	loff_t offset = iocb->ki_pos;
	loff_t end = offset + count;
	struct dio *dio;
	struct dio_submit sdio = { 0, };
	struct buffer_head map_bh = { 0, };
	struct blk_plug plug;
	unsigned long align = offset | iov_iter_alignment(iter);

	/*
	 * Offset and memory must be aligned to the fs block size; failing
	 * that, fall back to the device's logical block size before
	 * rejecting with -EINVAL.
	 */
	if (align & blocksize_mask) {
		if (bdev)
			blkbits = blksize_bits(bdev_logical_block_size(bdev));
		blocksize_mask = (1 << blkbits) - 1;
		if (align & blocksize_mask)
			goto out;
	}

	/* watch out for a 0 len io from a tricksy fs */
	if (iov_iter_rw(iter) == READ && !iov_iter_count(iter))
		return 0;

	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
	retval = -ENOMEM;
	if (!dio)
		goto out;

	/*
	 * The union at the end (pages[] / complete_work) is left
	 * uninitialised on purpose; only clear up to it.
	 */
	memset(dio, 0, offsetof(struct dio, pages));

	dio->flags = flags;
	if (dio->flags & DIO_LOCKING) {
		if (iov_iter_rw(iter) == READ) {
			struct address_space *mapping =
					iocb->ki_filp->f_mapping;

			/* Serialise reads against truncate and flush dirty
			 * pagecache so the device holds current data. */
			inode_lock(inode);

			retval = filemap_write_and_wait_range(mapping, offset,
							      end - 1);
			if (retval) {
				inode_unlock(inode);
				kmem_cache_free(dio_cache, dio);
				goto out;
			}
		}
	}

	/* Once we sampled i_size check for reads beyond EOF */
	dio->i_size = i_size_read(inode);
	if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
		if (dio->flags & DIO_LOCKING)
			inode_unlock(inode);
		kmem_cache_free(dio_cache, dio);
		retval = 0;
		goto out;
	}

	/*
	 * Async only when the iocb is async AND (unless DIO_ASYNC_EXTEND)
	 * the write does not extend the file — extending writes must
	 * complete synchronously so i_size updates are ordered.
	 */
	if (is_sync_kiocb(iocb))
		dio->is_async = false;
	else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
		 iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
		dio->is_async = false;
	else
		dio->is_async = true;

	dio->inode = inode;
	if (iov_iter_rw(iter) == WRITE) {
		dio->op = REQ_OP_WRITE;
		dio->op_flags = WRITE_ODIRECT;
	} else {
		dio->op = REQ_OP_READ;
	}

	/*
	 * O_DSYNC-style async writes need generic_write_sync() at
	 * completion, which sleeps — so defer completion to a workqueue.
	 */
	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
		retval = dio_set_defer_completion(dio);
		if (retval) {
			/*
			 * Workqueue creation failed; nothing submitted yet,
			 * so just free and bail.
			 */
			kmem_cache_free(dio_cache, dio);
			goto out;
		}
	}

	/*
	 * Will be decremented at I/O completion time.
	 */
	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = 0;
	sdio.blkbits = blkbits;
	sdio.blkfactor = i_blkbits - blkbits;
	sdio.block_in_file = offset >> blkbits;

	sdio.get_block = get_block;
	dio->end_io = end_io;
	sdio.submit_io = submit_io;
	sdio.final_block_in_bio = -1;
	sdio.next_block_for_io = -1;

	dio->iocb = iocb;

	spin_lock_init(&dio->bio_lock);
	dio->refcount = 1;	/* the submitter's reference */

	/* Only user-backed (ITER_IOVEC) pages need dirtying after reads. */
	dio->should_dirty = (iter->type == ITER_IOVEC);
	sdio.iter = iter;
	sdio.final_block_in_request =
		(offset + iov_iter_count(iter)) >> blkbits;

	/*
	 * Sub-fs-block IO may need up to two extra zero pages for the
	 * leading/trailing partial block.
	 */
	if (unlikely(sdio.blkfactor))
		sdio.pages_in_io = 2;

	sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);

	blk_start_plug(&plug);

	retval = do_direct_IO(dio, &sdio, &map_bh);
	if (retval)
		dio_cleanup(dio, &sdio);

	if (retval == -ENOTBLK) {
		/*
		 * Hole on a write: the caller falls back to buffered IO,
		 * so this is not an error.
		 */
		retval = 0;
	}

	/*
	 * Zero the trailing part of the last (possibly partial) new fs
	 * block, then flush any still-deferred page and bio.
	 */
	dio_zero_block(dio, &sdio, 1, &map_bh);

	if (sdio.cur_page) {
		ssize_t ret2;

		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
		if (retval == 0)
			retval = ret2;
		put_page(sdio.cur_page);
		sdio.cur_page = NULL;
	}
	if (sdio.bio)
		dio_bio_submit(dio, &sdio);

	blk_finish_plug(&plug);

	/*
	 * Release any leftover pinned pages (pipe may be non-empty after
	 * an error or a short mapping).
	 */
	dio_cleanup(dio, &sdio);

	/*
	 * For DIO_LOCKING reads, drop i_mutex now — writers hold it across
	 * the whole operation, but readers only need it for setup.
	 */
	if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
		inode_unlock(dio->inode);

	/*
	 * Async and fully submitted: report -EIOCBQUEUED and let the last
	 * bio completion finish the dio.  A short async WRITE must instead
	 * be completed synchronously (partial async writes are not
	 * reported to userspace).  Otherwise wait for all bios here.
	 */
	BUG_ON(retval == -EIOCBQUEUED);
	if (dio->is_async && retval == 0 && dio->result &&
	    (iov_iter_rw(iter) == READ || dio->result == count))
		retval = -EIOCBQUEUED;
	else
		dio_await_completion(dio);

	if (drop_refcount(dio) == 0) {
		retval = dio_complete(dio, retval, false);
	} else
		BUG_ON(retval != -EIOCBQUEUED);

out:
	return retval;
}
1340
/*
 * Public entry point for block-device-backed direct IO.  Prefetches the
 * bdev structures that submission will touch, then hands off to the
 * (inlined) engine.  See do_blockdev_direct_IO() for the real work.
 */
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
			     struct block_device *bdev, struct iov_iter *iter,
			     get_block_t get_block,
			     dio_iodone_t end_io, dio_submit_t submit_io,
			     int flags)
{
	/*
	 * Warm the cachelines the submission path will hit (partition
	 * table and request queue) while we still have cheap cycles.
	 */
	prefetch(&bdev->bd_disk->part_tbl);
	prefetch(bdev->bd_queue);
	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);

	return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
				     end_io, submit_io, flags);
}

EXPORT_SYMBOL(__blockdev_direct_IO);
1364
/*
 * Module init: create the struct dio slab cache.  SLAB_PANIC means there
 * is no failure path to handle here.
 */
static __init int dio_init(void)
{
	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
	return 0;
}
module_init(dio_init)
1371