/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static void bdrv_parent_cb_resize(BlockDriverState *bs);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);

void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
                               bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        bdrv_parent_drained_begin_single(c, false);
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
                             bool ignore_bds_parents)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}

static bool bdrv_parent_drained_poll_single(BdrvChild *c)
{
    if (c->role->drained_poll) {
        return c->role->drained_poll(c);
    }
    return false;
}

static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
                                     bool ignore_bds_parents)
{
    BdrvChild *c, *next;
    bool busy = false;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
            continue;
        }
        busy |= bdrv_parent_drained_poll_single(c);
    }

    return busy;
}

void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
{
    if (c->role->drained_begin) {
        c->role->drained_begin(c);
    }
    if (poll) {
        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
    }
}

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
                                drv->bdrv_aio_preadv) ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
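
/*
 * Usage sketch: because copy-on-read is reference counted, independent users
 * simply pair their own enable/disable calls and never need to save or
 * restore the previous state.  Roughly (assuming the caller already owns a
 * reference to bs):
 *
 *     bdrv_enable_copy_on_read(bs);    // e.g. while a streaming job runs
 *     ...                              // reads may now populate the top layer
 *     bdrv_disable_copy_on_read(bs);   // COR stays on if others enabled it too
 */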

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    bool poll;
    BdrvChild *parent;
    bool ignore_bds_parents;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup. */
    atomic_mb_set(&data->done, true);
    bdrv_dec_in_flight(bs);

    if (data->begin) {
        g_free(data);
    }
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
{
    BdrvCoDrainData *data;

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data = g_new(BdrvCoDrainData, 1);
    *data = (BdrvCoDrainData) {
        .bs = bs,
        .done = false,
        .begin = begin
    };

    /* Make sure the driver callback completes during the polling phase for
     * drain_begin. */
    bdrv_inc_in_flight(bs);
    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
    aio_co_schedule(bdrv_get_aio_context(bs), data->co);

    if (!begin) {
        BDRV_POLL_WHILE(bs, !data->done);
        g_free(data);
    }
}

/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;

    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
        return true;
    }

    if (atomic_read(&bs->in_flight)) {
        return true;
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
                return true;
            }
        }
    }

    return false;
}

static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
                                      BdrvChild *ignore_parent)
{
    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    if (bs) {
        AioContext *ctx = bdrv_get_aio_context(bs);
        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);

        /*
         * When the coroutine yielded, the lock for its home context was
         * released, so we need to re-acquire it here. If it explicitly
         * acquired a different context, the lock is still held and we don't
         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
         */
        if (ctx == co_ctx) {
            aio_context_acquire(ctx);
        }
        bdrv_dec_in_flight(bs);
        if (data->begin) {
            bdrv_do_drained_begin(bs, data->recursive, data->parent,
                                  data->ignore_bds_parents, data->poll);
        } else {
            bdrv_do_drained_end(bs, data->recursive, data->parent,
                                data->ignore_bds_parents);
        }
        if (ctx == co_ctx) {
            aio_context_release(ctx);
        }
    } else {
        assert(data->begin);
        bdrv_drain_all_begin();
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent,
                                                bool ignore_bds_parents,
                                                bool poll)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
        .ignore_bds_parents = ignore_bds_parents,
        .poll = poll,
    };
    if (bs) {
        bdrv_inc_in_flight(bs);
    }
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents)
{
    assert(!qemu_in_coroutine());

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
    bdrv_drain_invoke(bs, true);
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent, bool ignore_bds_parents,
                                  bool poll)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
                               poll);
        return;
    }

    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
                                  false);
        }
    }

    /*
     * Wait for drained requests to finish.
     *
     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
     * call is needed so things in this AioContext can make progress even
     * though we don't return to the main AioContext loop - this automatically
     * includes other nodes in the same AioContext and therefore all child
     * nodes.
     */
    if (poll) {
        assert(!ignore_bds_parents);
        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL, false, true);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL, false, true);
}

static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent, bool ignore_bds_parents)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
                               false);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false);
    bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        assert(!ignore_bds_parents);
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL, false);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL, false);
}

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child, false, true);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child, false);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}
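
/*
 * Usage sketch: callers that need a node to be quiescent around some
 * operation (e.g. a graph change) normally bracket it in a drained section
 * rather than calling bdrv_drain().  Roughly, assuming the caller holds the
 * node's AioContext:
 *
 *     bdrv_drained_begin(bs);
 *     ...                        // no new requests, none in flight
 *     bdrv_drained_end(bs);
 */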

static void bdrv_drain_assert_idle(BlockDriverState *bs)
{
    BdrvChild *child, *next;

    assert(atomic_read(&bs->in_flight) == 0);
    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
        bdrv_drain_assert_idle(child->bs);
    }
}

unsigned int bdrv_drain_all_count = 0;

static bool bdrv_drain_all_poll(void)
{
    BlockDriverState *bs = NULL;
    bool result = false;

    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        aio_context_acquire(aio_context);
        result |= bdrv_drain_poll(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    return result;
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    BlockDriverState *bs = NULL;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
        return;
    }

    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
     * loop AioContext, so make sure we're in the main context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
    assert(bdrv_drain_all_count < INT_MAX);
    bdrv_drain_all_count++;

    /* Quiesce all nodes, without polling in-flight requests yet. The graph
     * cannot change during this loop. */
    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_begin(bs, false, NULL, true, false);
        aio_context_release(aio_context);
    }

    /* Now poll the in-flight requests */
    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());

    while ((bs = bdrv_next_all_states(bs))) {
        bdrv_drain_assert_idle(bs);
    }
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs = NULL;

    while ((bs = bdrv_next_all_states(bs))) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        aio_context_acquire(aio_context);
        bdrv_do_drained_end(bs, false, NULL, true);
        aio_context_release(aio_context);
    }

    assert(bdrv_drain_all_count > 0);
    bdrv_drain_all_count--;
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
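
/*
 * Usage sketch: as noted above bdrv_drain_all_begin(), draining does not
 * flush, so code that also needs data on stable storage combines the two,
 * roughly:
 *
 *     bdrv_drain_all_begin();    // quiesce every node, pause block jobs
 *     ret = bdrv_flush_all();    // then push everything out to disk
 *     bdrv_drain_all_end();
 */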

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  uint64_t bytes,
                                  enum BdrvTrackedRequestType type)
{
    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);

    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset         = offset,
        .bytes          = bytes,
        .type           = type,
        .co             = qemu_coroutine_self(),
        .serialising    = false,
        .overlap_offset = offset,
        .overlap_bytes  = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                           - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
{
    /*
     * If the request is serialising, overlap_offset and overlap_bytes are set,
     * so we can check if the request is aligned. Otherwise, don't care and
     * return false.
     */
    return req->serialising && (req->offset == req->overlap_offset) &&
           (req->bytes == req->overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
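
/*
 * Worked example: with a 64 KiB cluster size, offset = 70000 and
 * bytes = 4096 round to
 *     *cluster_offset = QEMU_ALIGN_DOWN(70000, 65536)              = 65536
 *     *cluster_bytes  = QEMU_ALIGN_UP(70000 - 65536 + 4096, 65536) = 65536
 * i.e. the enclosing [65536, 131072) cluster range.
 */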

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, uint64_t bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick();
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_BYTES) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
    aio_wait_kick();
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}
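
/*
 * Note: bdrv_prwv_co() is the bridge used by the synchronous wrappers below.
 * Outside coroutine context it spawns a coroutine and then polls until the
 * NOT_DONE sentinel in rwco.ret has been overwritten by the real result,
 * which is why NOT_DONE must never collide with a value a request can
 * actually return.
 */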

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);

    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
        offset += bytes;
    }
}
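
/*
 * Note: the loop above walks the image in chunks of at most
 * BDRV_REQUEST_MAX_BYTES and lets bdrv_block_status() shrink "bytes" to the
 * extent it actually describes, so ranges that already read as zero
 * (BDRV_BLOCK_ZERO) are skipped without issuing any write; only the remaining
 * extents are passed to bdrv_pwrite_zeroes().
 */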

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* See bdrv_pwrite() for the return codes */
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

/* Return no. of bytes on success or < 0 on error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid offset or number of bytes
  -EACCES      Trying to write a read-only device
*/
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

    if (bytes < 0) {
        return -EINVAL;
    }

    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}
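
/*
 * Usage sketch: bdrv_pwrite_sync() suits ordering-sensitive metadata updates,
 * e.g. (with a hypothetical on-disk header structure) committing a header
 * only once the data it refers to is stable:
 *
 *     ret = bdrv_pwrite_sync(bs->file, 0, &header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;   // no later write has been reordered before the header
 *     }
 */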

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    if (drv->bdrv_aio_preadv) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
                                   bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
    assert(drv->bdrv_co_readv);

    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));
    assert(!(flags & BDRV_REQ_NO_FALLBACK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    if (drv->bdrv_aio_pwritev) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
                                    flags & bs->supported_write_flags,
                                    bdrv_co_io_em_complete, &co);
        flags &= ~bs->supported_write_flags;
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert(bytes <= BDRV_REQUEST_MAX_BYTES);

    assert(drv->bdrv_co_writev);
    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
                              flags & bs->supported_write_flags);
    flags &= ~bs->supported_write_flags;

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        ret = bdrv_co_flush(bs);
    }

    return ret;
}
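
/*
 * Note: both driver dispatch helpers above try the driver interfaces from
 * newest to oldest - byte-based coroutine (.bdrv_co_preadv/.bdrv_co_pwritev),
 * then byte-based AIO (.bdrv_aio_preadv/.bdrv_aio_pwritev), then the legacy
 * sector-based .bdrv_co_readv/.bdrv_co_writev.  On the write path, any flag
 * the driver does not advertise in bs->supported_write_flags is stripped
 * before the call and emulated afterwards; in practice an unsupported
 * BDRV_REQ_FUA becomes a bdrv_co_flush() after a successful write.
 */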

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it could request permissions. Therefore we have to bypass the permission
     * system for the moment. */
    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file. Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * is one reason we loop rather than doing it all at once.
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if
             * unallocated; we'll probably fail again soon on the
             * read, but at least that will set a decent errno.
             */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        /* Stop at EOF if the image ends in the middle of the cluster */
        if (ret == 0 && pnum == 0) {
            assert(progress >= bytes);
            break;
        }

        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * that still correctly reads as zero? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
                                               BDRV_REQ_WRITE_UNCHANGED);
            } else {
                /* This does not change the data on the disk, it is not
                 * necessary to flush even in cache=writethrough mode.
                 */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov,
                                          BDRV_REQ_WRITE_UNCHANGED);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    /* BDRV_REQ_SERIALISING is only for write operation */
    assert(!(flags & BDRV_REQ_SERIALISING));

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Handle a read request in coroutine context
 */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));
        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}
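
/*
 * Worked example: with request_alignment = 512, a read of offset = 1000,
 * bytes = 100 is widened as follows: the head padding is
 * offset & 511 = 488 bytes, so offset becomes 512 and bytes becomes 588;
 * the tail check then rounds bytes up to 1024.  The driver sees a single
 * aligned 1024-byte read into head_buf + qiov + tail_buf, and only the
 * caller's original 100 bytes end up in its own buffer.
 */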

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    void *buf = NULL;
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
        return -ENOTSUP;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret < 0 && !(flags & BDRV_REQ_NO_FALLBACK)) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwrite() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            if (buf == NULL) {
                buf = qemu_try_blockalign0(bs, num);
                if (buf == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }
            qemu_iovec_init_buf(&qiov, buf, num);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(buf);
                buf = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(buf);
    return ret;
}
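
/*
 * Worked example: with alignment = 4096 (and max_write_zeroes/max_transfer
 * large enough not to split further), a request of offset = 4000,
 * bytes = 10000 is handled by the loop above as an unaligned 96-byte head
 * ([4000, 4096)), an aligned middle ([4096, 12288)) and an unaligned
 * 1712-byte tail ([12288, 14000)), so the driver only ever sees one
 * unaligned piece at each end.
 */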

static inline int coroutine_fn
bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
                          BdrvTrackedRequest *req, int flags)
{
    BlockDriverState *bs = child->bs;
    bool waited;
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);

    if (bs->read_only) {
        return -EPERM;
    }

    /* BDRV_REQ_NO_SERIALISING is only for read operation */
    assert(!(flags & BDRV_REQ_NO_SERIALISING));
    assert(!(bs->open_flags & BDRV_O_INACTIVE));
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));

    if (flags & BDRV_REQ_SERIALISING) {
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    waited = wait_serialising_requests(req);

    assert(!waited || !req->serialising ||
           is_request_serialising_and_aligned(req));
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    switch (req->type) {
    case BDRV_TRACKED_WRITE:
    case BDRV_TRACKED_DISCARD:
        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
        } else {
            assert(child->perm & BLK_PERM_WRITE);
        }
        return notifier_with_return_list_notify(&bs->before_write_notifiers,
                                                req);
    case BDRV_TRACKED_TRUNCATE:
        assert(child->perm & BLK_PERM_RESIZE);
        return 0;
    default:
        abort();
    }
}

static inline void coroutine_fn
bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
                         BdrvTrackedRequest *req, int ret)
{
    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    BlockDriverState *bs = child->bs;

    atomic_inc(&bs->write_gen);

    /*
     * Discard cannot extend the image, but in error handling cases, such as
     * when reverting a qcow2 cluster allocation, the discarded range can pass
     * the end of image file, so we cannot assert about BDRV_TRACKED_DISCARD
     * here. Instead, just skip it, since semantically a discard request
     * beyond EOF cannot expand the image anyway.
     */
    if (ret == 0 &&
        (req->type == BDRV_TRACKED_TRUNCATE ||
         end_sector > bs->total_sectors) &&
        req->type != BDRV_TRACKED_DISCARD) {
        bs->total_sectors = end_sector;
        bdrv_parent_cb_resize(bs);
        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
    }
    if (req->bytes) {
        switch (req->type) {
        case BDRV_TRACKED_WRITE:
            stat64_max(&bs->wr_highest_offset, offset + bytes);
            /* fall through, to set dirty bits */
        case BDRV_TRACKED_DISCARD:
            bdrv_set_dirty(bs, offset, bytes);
            break;
        default:
            break;
        }
    }
}

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    int ret;

    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    if (ret >= 0) {
        ret = 0;
    }
    bdrv_co_write_req_finish(child, offset, bytes, req, ret);

    return ret;
}
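
/*
 * Note: every aligned write goes through the same bracket -
 * bdrv_co_write_req_prepare() checks permissions, serialisation and the
 * before-write notifiers, exactly one of the zero/compressed/plain/fragmented
 * branches issues the driver I/O, and bdrv_co_write_req_finish() bumps the
 * write generation, grows bs->total_sectors if the image was extended and
 * marks the dirty bitmaps.
 */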
  1450. static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
  1451. int64_t offset,
  1452. unsigned int bytes,
  1453. BdrvRequestFlags flags,
  1454. BdrvTrackedRequest *req)
  1455. {
  1456. BlockDriverState *bs = child->bs;
  1457. uint8_t *buf = NULL;
  1458. QEMUIOVector local_qiov;
  1459. uint64_t align = bs->bl.request_alignment;
  1460. unsigned int head_padding_bytes, tail_padding_bytes;
  1461. int ret = 0;
  1462. head_padding_bytes = offset & (align - 1);
  1463. tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
  1464. assert(flags & BDRV_REQ_ZERO_WRITE);
  1465. if (head_padding_bytes || tail_padding_bytes) {
  1466. buf = qemu_blockalign(bs, align);
  1467. qemu_iovec_init_buf(&local_qiov, buf, align);
  1468. }
  1469. if (head_padding_bytes) {
  1470. uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
  1471. /* RMW the unaligned part before head. */
  1472. mark_request_serialising(req, align);
  1473. wait_serialising_requests(req);
  1474. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
  1475. ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
  1476. align, &local_qiov, 0);
  1477. if (ret < 0) {
  1478. goto fail;
  1479. }
  1480. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
  1481. memset(buf + head_padding_bytes, 0, zero_bytes);
  1482. ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
  1483. align, &local_qiov,
  1484. flags & ~BDRV_REQ_ZERO_WRITE);
  1485. if (ret < 0) {
  1486. goto fail;
  1487. }
  1488. offset += zero_bytes;
  1489. bytes -= zero_bytes;
  1490. }
  1491. assert(!bytes || (offset & (align - 1)) == 0);
  1492. if (bytes >= align) {
  1493. /* Write the aligned part in the middle. */
  1494. uint64_t aligned_bytes = bytes & ~(align - 1);
  1495. ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
  1496. NULL, flags);
  1497. if (ret < 0) {
  1498. goto fail;
  1499. }
  1500. bytes -= aligned_bytes;
  1501. offset += aligned_bytes;
  1502. }
  1503. assert(!bytes || (offset & (align - 1)) == 0);
  1504. if (bytes) {
  1505. assert(align == tail_padding_bytes + bytes);
  1506. /* RMW the unaligned part after tail. */
  1507. mark_request_serialising(req, align);
  1508. wait_serialising_requests(req);
  1509. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
  1510. ret = bdrv_aligned_preadv(child, req, offset, align,
  1511. align, &local_qiov, 0);
  1512. if (ret < 0) {
  1513. goto fail;
  1514. }
  1515. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
  1516. memset(buf, 0, bytes);
  1517. ret = bdrv_aligned_pwritev(child, req, offset, align, align,
  1518. &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
  1519. }
  1520. fail:
  1521. qemu_vfree(buf);
  1522. return ret;
  1523. }
  1524. /*
  1525. * Handle a write request in coroutine context
  1526. */
  1527. int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
  1528. int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
  1529. BdrvRequestFlags flags)
  1530. {
  1531. BlockDriverState *bs = child->bs;
  1532. BdrvTrackedRequest req;
  1533. uint64_t align = bs->bl.request_alignment;
  1534. uint8_t *head_buf = NULL;
  1535. uint8_t *tail_buf = NULL;
  1536. QEMUIOVector local_qiov;
  1537. bool use_local_qiov = false;
  1538. int ret;
  1539. trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
  1540. if (!bs->drv) {
  1541. return -ENOMEDIUM;
  1542. }
  1543. ret = bdrv_check_byte_request(bs, offset, bytes);
  1544. if (ret < 0) {
  1545. return ret;
  1546. }
  1547. bdrv_inc_in_flight(bs);
  1548. /*
  1549. * Align write if necessary by performing a read-modify-write cycle.
  1550. * Pad qiov with the read parts and be sure to have a tracked request not
  1551. * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
  1552. */
  1553. tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
  1554. if (flags & BDRV_REQ_ZERO_WRITE) {
  1555. ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
  1556. goto out;
  1557. }
  1558. if (offset & (align - 1)) {
  1559. QEMUIOVector head_qiov;
  1560. mark_request_serialising(&req, align);
  1561. wait_serialising_requests(&req);
  1562. head_buf = qemu_blockalign(bs, align);
  1563. qemu_iovec_init_buf(&head_qiov, head_buf, align);
  1564. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
  1565. ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
  1566. align, &head_qiov, 0);
  1567. if (ret < 0) {
  1568. goto fail;
  1569. }
  1570. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
  1571. qemu_iovec_init(&local_qiov, qiov->niov + 2);
  1572. qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
  1573. qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
  1574. use_local_qiov = true;
  1575. bytes += offset & (align - 1);
  1576. offset = offset & ~(align - 1);
  1577. /* We have read the tail already if the request is smaller
  1578. * than one aligned block.
  1579. */
  1580. if (bytes < align) {
  1581. qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
  1582. bytes = align;
  1583. }
  1584. }
  1585. if ((offset + bytes) & (align - 1)) {
  1586. QEMUIOVector tail_qiov;
  1587. size_t tail_bytes;
  1588. bool waited;
  1589. mark_request_serialising(&req, align);
  1590. waited = wait_serialising_requests(&req);
  1591. assert(!waited || !use_local_qiov);
  1592. tail_buf = qemu_blockalign(bs, align);
  1593. qemu_iovec_init_buf(&tail_qiov, tail_buf, align);
  1594. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
  1595. ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
  1596. align, align, &tail_qiov, 0);
  1597. if (ret < 0) {
  1598. goto fail;
  1599. }
  1600. bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
  1601. if (!use_local_qiov) {
  1602. qemu_iovec_init(&local_qiov, qiov->niov + 1);
  1603. qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
  1604. use_local_qiov = true;
  1605. }
  1606. tail_bytes = (offset + bytes) & (align - 1);
  1607. qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
  1608. bytes = ROUND_UP(bytes, align);
  1609. }
  1610. ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
  1611. use_local_qiov ? &local_qiov : qiov,
  1612. flags);
  1613. fail:
  1614. if (use_local_qiov) {
  1615. qemu_iovec_destroy(&local_qiov);
  1616. }
  1617. qemu_vfree(head_buf);
  1618. qemu_vfree(tail_buf);
  1619. out:
  1620. tracked_request_end(&req);
  1621. bdrv_dec_in_flight(bs);
  1622. return ret;
  1623. }
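/*
 * Illustrative example of the padding performed above, with hypothetical
 * values: request_alignment = 512, offset = 700, bytes = 1000.
 *
 *   head: 700 & 511 = 188 -> read block [512, 1024), prepend its first
 *         188 bytes to local_qiov; offset = 512, bytes = 1188
 *   tail: (512 + 1188) & 511 = 164 -> read block [1536, 2048), append its
 *         last 512 - 164 = 348 bytes; bytes = ROUND_UP(1188, 512) = 1536
 *
 * The final bdrv_aligned_pwritev() then covers [512, 2048) with a
 * local_qiov of 188 + 1000 + 348 = 1536 bytes (guest data plus the
 * read-back padding from the RMW cycle).
 */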
  1624. int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
  1625. int bytes, BdrvRequestFlags flags)
  1626. {
  1627. trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
  1628. if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
  1629. flags &= ~BDRV_REQ_MAY_UNMAP;
  1630. }
  1631. return bdrv_co_pwritev(child, offset, bytes, NULL,
  1632. BDRV_REQ_ZERO_WRITE | flags);
  1633. }
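/*
 * Typical call pattern (illustrative only; 'child', 'offset' and 'len'
 * stand for hypothetical caller state): a caller that would also like
 * the zeroed range to be deallocated where possible passes
 * BDRV_REQ_MAY_UNMAP, which is dropped above if the node was not opened
 * with BDRV_O_UNMAP:
 *
 *   ret = bdrv_co_pwrite_zeroes(child, offset, len, BDRV_REQ_MAY_UNMAP);
 */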
  1634. /*
1635. * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend or not.
  1636. */
  1637. int bdrv_flush_all(void)
  1638. {
  1639. BdrvNextIterator it;
  1640. BlockDriverState *bs = NULL;
  1641. int result = 0;
  1642. for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
  1643. AioContext *aio_context = bdrv_get_aio_context(bs);
  1644. int ret;
  1645. aio_context_acquire(aio_context);
  1646. ret = bdrv_flush(bs);
  1647. if (ret < 0 && !result) {
  1648. result = ret;
  1649. }
  1650. aio_context_release(aio_context);
  1651. }
  1652. return result;
  1653. }
  1654. typedef struct BdrvCoBlockStatusData {
  1655. BlockDriverState *bs;
  1656. BlockDriverState *base;
  1657. bool want_zero;
  1658. int64_t offset;
  1659. int64_t bytes;
  1660. int64_t *pnum;
  1661. int64_t *map;
  1662. BlockDriverState **file;
  1663. int ret;
  1664. bool done;
  1665. } BdrvCoBlockStatusData;
  1666. int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
  1667. bool want_zero,
  1668. int64_t offset,
  1669. int64_t bytes,
  1670. int64_t *pnum,
  1671. int64_t *map,
  1672. BlockDriverState **file)
  1673. {
  1674. assert(bs->file && bs->file->bs);
  1675. *pnum = bytes;
  1676. *map = offset;
  1677. *file = bs->file->bs;
  1678. return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
  1679. }
  1680. int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
  1681. bool want_zero,
  1682. int64_t offset,
  1683. int64_t bytes,
  1684. int64_t *pnum,
  1685. int64_t *map,
  1686. BlockDriverState **file)
  1687. {
  1688. assert(bs->backing && bs->backing->bs);
  1689. *pnum = bytes;
  1690. *map = offset;
  1691. *file = bs->backing->bs;
  1692. return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
  1693. }
  1694. /*
1695. * Returns the allocation status of the specified range.
1696. * Drivers not implementing the functionality are assumed not to support
1697. * backing files, hence the whole range is reported as allocated.
  1698. *
  1699. * If 'want_zero' is true, the caller is querying for mapping
  1700. * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
  1701. * _ZERO where possible; otherwise, the result favors larger 'pnum',
  1702. * with a focus on accurate BDRV_BLOCK_ALLOCATED.
  1703. *
  1704. * If 'offset' is beyond the end of the disk image the return value is
  1705. * BDRV_BLOCK_EOF and 'pnum' is set to 0.
  1706. *
  1707. * 'bytes' is the max value 'pnum' should be set to. If bytes goes
  1708. * beyond the end of the disk image it will be clamped; if 'pnum' is set to
  1709. * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
  1710. *
  1711. * 'pnum' is set to the number of bytes (including and immediately
  1712. * following the specified offset) that are easily known to be in the
  1713. * same allocated/unallocated state. Note that a second call starting
  1714. * at the original offset plus returned pnum may have the same status.
  1715. * The returned value is non-zero on success except at end-of-file.
  1716. *
  1717. * Returns negative errno on failure. Otherwise, if the
  1718. * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
  1719. * set to the host mapping and BDS corresponding to the guest offset.
  1720. */
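/*
 * Illustrative reading of these semantics (hypothetical image): for a
 * 1 MiB image whose first 64 KiB are written and whose remainder is
 * unallocated, a query at offset 0 with bytes = 1 MiB may return
 * BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED with *pnum = 64 KiB; a second
 * query at offset 64 KiB may then return BDRV_BLOCK_ZERO (if unallocated
 * areas read as zero) with *pnum = 960 KiB, plus BDRV_BLOCK_EOF because
 * the result reaches the end of the image.
 */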
  1721. static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
  1722. bool want_zero,
  1723. int64_t offset, int64_t bytes,
  1724. int64_t *pnum, int64_t *map,
  1725. BlockDriverState **file)
  1726. {
  1727. int64_t total_size;
  1728. int64_t n; /* bytes */
  1729. int ret;
  1730. int64_t local_map = 0;
  1731. BlockDriverState *local_file = NULL;
  1732. int64_t aligned_offset, aligned_bytes;
  1733. uint32_t align;
  1734. assert(pnum);
  1735. *pnum = 0;
  1736. total_size = bdrv_getlength(bs);
  1737. if (total_size < 0) {
  1738. ret = total_size;
  1739. goto early_out;
  1740. }
  1741. if (offset >= total_size) {
  1742. ret = BDRV_BLOCK_EOF;
  1743. goto early_out;
  1744. }
  1745. if (!bytes) {
  1746. ret = 0;
  1747. goto early_out;
  1748. }
  1749. n = total_size - offset;
  1750. if (n < bytes) {
  1751. bytes = n;
  1752. }
  1753. /* Must be non-NULL or bdrv_getlength() would have failed */
  1754. assert(bs->drv);
  1755. if (!bs->drv->bdrv_co_block_status) {
  1756. *pnum = bytes;
  1757. ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
  1758. if (offset + bytes == total_size) {
  1759. ret |= BDRV_BLOCK_EOF;
  1760. }
  1761. if (bs->drv->protocol_name) {
  1762. ret |= BDRV_BLOCK_OFFSET_VALID;
  1763. local_map = offset;
  1764. local_file = bs;
  1765. }
  1766. goto early_out;
  1767. }
  1768. bdrv_inc_in_flight(bs);
  1769. /* Round out to request_alignment boundaries */
  1770. align = bs->bl.request_alignment;
  1771. aligned_offset = QEMU_ALIGN_DOWN(offset, align);
  1772. aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
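/*
 * Worked example with hypothetical numbers: align = 512, offset = 700,
 * bytes = 100 gives aligned_offset = 512 and
 * aligned_bytes = ROUND_UP(800, 512) - 512 = 512, so the driver is
 * queried for the whole block [512, 1024); the clamping below then
 * subtracts the 188 leading bytes again and limits *pnum to the 100
 * bytes the caller asked about.
 */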
  1773. ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
  1774. aligned_bytes, pnum, &local_map,
  1775. &local_file);
  1776. if (ret < 0) {
  1777. *pnum = 0;
  1778. goto out;
  1779. }
  1780. /*
  1781. * The driver's result must be a non-zero multiple of request_alignment.
  1782. * Clamp pnum and adjust map to original request.
  1783. */
  1784. assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
  1785. align > offset - aligned_offset);
  1786. *pnum -= offset - aligned_offset;
  1787. if (*pnum > bytes) {
  1788. *pnum = bytes;
  1789. }
  1790. if (ret & BDRV_BLOCK_OFFSET_VALID) {
  1791. local_map += offset - aligned_offset;
  1792. }
  1793. if (ret & BDRV_BLOCK_RAW) {
  1794. assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
  1795. ret = bdrv_co_block_status(local_file, want_zero, local_map,
  1796. *pnum, pnum, &local_map, &local_file);
  1797. goto out;
  1798. }
  1799. if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
  1800. ret |= BDRV_BLOCK_ALLOCATED;
  1801. } else if (want_zero) {
  1802. if (bdrv_unallocated_blocks_are_zero(bs)) {
  1803. ret |= BDRV_BLOCK_ZERO;
  1804. } else if (bs->backing) {
  1805. BlockDriverState *bs2 = bs->backing->bs;
  1806. int64_t size2 = bdrv_getlength(bs2);
  1807. if (size2 >= 0 && offset >= size2) {
  1808. ret |= BDRV_BLOCK_ZERO;
  1809. }
  1810. }
  1811. }
  1812. if (want_zero && local_file && local_file != bs &&
  1813. (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
  1814. (ret & BDRV_BLOCK_OFFSET_VALID)) {
  1815. int64_t file_pnum;
  1816. int ret2;
  1817. ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
  1818. *pnum, &file_pnum, NULL, NULL);
  1819. if (ret2 >= 0) {
  1820. /* Ignore errors. This is just providing extra information, it
  1821. * is useful but not necessary.
  1822. */
  1823. if (ret2 & BDRV_BLOCK_EOF &&
  1824. (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
  1825. /*
  1826. * It is valid for the format block driver to read
  1827. * beyond the end of the underlying file's current
  1828. * size; such areas read as zero.
  1829. */
  1830. ret |= BDRV_BLOCK_ZERO;
  1831. } else {
  1832. /* Limit request to the range reported by the protocol driver */
  1833. *pnum = file_pnum;
  1834. ret |= (ret2 & BDRV_BLOCK_ZERO);
  1835. }
  1836. }
  1837. }
  1838. out:
  1839. bdrv_dec_in_flight(bs);
  1840. if (ret >= 0 && offset + *pnum == total_size) {
  1841. ret |= BDRV_BLOCK_EOF;
  1842. }
  1843. early_out:
  1844. if (file) {
  1845. *file = local_file;
  1846. }
  1847. if (map) {
  1848. *map = local_map;
  1849. }
  1850. return ret;
  1851. }
  1852. static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
  1853. BlockDriverState *base,
  1854. bool want_zero,
  1855. int64_t offset,
  1856. int64_t bytes,
  1857. int64_t *pnum,
  1858. int64_t *map,
  1859. BlockDriverState **file)
  1860. {
  1861. BlockDriverState *p;
  1862. int ret = 0;
  1863. bool first = true;
  1864. assert(bs != base);
  1865. for (p = bs; p != base; p = backing_bs(p)) {
  1866. ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
  1867. file);
  1868. if (ret < 0) {
  1869. break;
  1870. }
  1871. if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
  1872. /*
  1873. * Reading beyond the end of the file continues to read
  1874. * zeroes, but we can only widen the result to the
  1875. * unallocated length we learned from an earlier
  1876. * iteration.
  1877. */
  1878. *pnum = bytes;
  1879. }
  1880. if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
  1881. break;
  1882. }
1883. /* [offset, offset + *pnum) is unallocated on this layer, which could be
1884. * only the first part of [offset, offset + bytes). */
  1885. bytes = MIN(bytes, *pnum);
  1886. first = false;
  1887. }
  1888. return ret;
  1889. }
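/*
 * Illustrative chain walk (hypothetical chain base <- mid <- top): a
 * query on 'top' that finds the range unallocated there is narrowed to
 * *pnum and repeated on 'mid'.  If 'mid' is a shorter file and reports
 * BDRV_BLOCK_ZERO | BDRV_BLOCK_EOF, the result is widened back to the
 * length already known to be unallocated in 'top', since reading past
 * the end of a backing file keeps returning zeroes.  As soon as some
 * layer reports DATA or ZERO for the (possibly narrowed) range, the
 * walk stops and that layer's answer is returned.
 */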
  1890. /* Coroutine wrapper for bdrv_block_status_above() */
  1891. static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
  1892. {
  1893. BdrvCoBlockStatusData *data = opaque;
  1894. data->ret = bdrv_co_block_status_above(data->bs, data->base,
  1895. data->want_zero,
  1896. data->offset, data->bytes,
  1897. data->pnum, data->map, data->file);
  1898. data->done = true;
  1899. aio_wait_kick();
  1900. }
  1901. /*
  1902. * Synchronous wrapper around bdrv_co_block_status_above().
  1903. *
  1904. * See bdrv_co_block_status_above() for details.
  1905. */
  1906. static int bdrv_common_block_status_above(BlockDriverState *bs,
  1907. BlockDriverState *base,
  1908. bool want_zero, int64_t offset,
  1909. int64_t bytes, int64_t *pnum,
  1910. int64_t *map,
  1911. BlockDriverState **file)
  1912. {
  1913. Coroutine *co;
  1914. BdrvCoBlockStatusData data = {
  1915. .bs = bs,
  1916. .base = base,
  1917. .want_zero = want_zero,
  1918. .offset = offset,
  1919. .bytes = bytes,
  1920. .pnum = pnum,
  1921. .map = map,
  1922. .file = file,
  1923. .done = false,
  1924. };
  1925. if (qemu_in_coroutine()) {
  1926. /* Fast-path if already in coroutine context */
  1927. bdrv_block_status_above_co_entry(&data);
  1928. } else {
  1929. co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
  1930. bdrv_coroutine_enter(bs, co);
  1931. BDRV_POLL_WHILE(bs, !data.done);
  1932. }
  1933. return data.ret;
  1934. }
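/*
 * This is the usual synchronous-wrapper shape used throughout this file
 * (compare bdrv_flush(), bdrv_pdiscard() and bdrv_truncate() below):
 * when the caller already runs in a coroutine the entry function is
 * invoked directly, otherwise a coroutine is created and
 * BDRV_POLL_WHILE() drives the AioContext until the done/ret sentinel
 * shows that the operation has finished.
 */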
  1935. int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
  1936. int64_t offset, int64_t bytes, int64_t *pnum,
  1937. int64_t *map, BlockDriverState **file)
  1938. {
  1939. return bdrv_common_block_status_above(bs, base, true, offset, bytes,
  1940. pnum, map, file);
  1941. }
  1942. int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
  1943. int64_t *pnum, int64_t *map, BlockDriverState **file)
  1944. {
  1945. return bdrv_block_status_above(bs, backing_bs(bs),
  1946. offset, bytes, pnum, map, file);
  1947. }
  1948. int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
  1949. int64_t bytes, int64_t *pnum)
  1950. {
  1951. int ret;
  1952. int64_t dummy;
  1953. ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
  1954. bytes, pnum ? pnum : &dummy, NULL,
  1955. NULL);
  1956. if (ret < 0) {
  1957. return ret;
  1958. }
  1959. return !!(ret & BDRV_BLOCK_ALLOCATED);
  1960. }
  1961. /*
  1962. * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
  1963. *
  1964. * Return true if (a prefix of) the given range is allocated in any image
  1965. * between BASE and TOP (inclusive). BASE can be NULL to check if the given
  1966. * offset is allocated in any image of the chain. Return false otherwise,
  1967. * or negative errno on failure.
  1968. *
  1969. * 'pnum' is set to the number of bytes (including and immediately
  1970. * following the specified offset) that are known to be in the same
  1971. * allocated/unallocated state. Note that a subsequent call starting
  1972. * at 'offset + *pnum' may return the same allocation status (in other
  1973. * words, the result is not necessarily the maximum possible range);
  1974. * but 'pnum' will only be 0 when end of file is reached.
  1975. *
  1976. */
  1977. int bdrv_is_allocated_above(BlockDriverState *top,
  1978. BlockDriverState *base,
  1979. int64_t offset, int64_t bytes, int64_t *pnum)
  1980. {
  1981. BlockDriverState *intermediate;
  1982. int ret;
  1983. int64_t n = bytes;
  1984. intermediate = top;
  1985. while (intermediate && intermediate != base) {
  1986. int64_t pnum_inter;
  1987. int64_t size_inter;
  1988. ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
  1989. if (ret < 0) {
  1990. return ret;
  1991. }
  1992. if (ret) {
  1993. *pnum = pnum_inter;
  1994. return 1;
  1995. }
  1996. size_inter = bdrv_getlength(intermediate);
  1997. if (size_inter < 0) {
  1998. return size_inter;
  1999. }
  2000. if (n > pnum_inter &&
  2001. (intermediate == top || offset + pnum_inter < size_inter)) {
  2002. n = pnum_inter;
  2003. }
  2004. intermediate = backing_bs(intermediate);
  2005. }
  2006. *pnum = n;
  2007. return 0;
  2008. }
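/*
 * Illustrative example (hypothetical chain base <- inter <- top, all
 * 1 MiB large): if [0, 64K) is written only in 'inter', then
 * bdrv_is_allocated_above(top, base, 0, 128K, &pnum) first finds the
 * range unallocated in 'top', then finds the first 64 KiB allocated in
 * 'inter' and returns 1 with *pnum = 64K.  A range that is unallocated
 * in every layer above (and excluding) 'base' yields 0 instead.
 */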
  2009. typedef struct BdrvVmstateCo {
  2010. BlockDriverState *bs;
  2011. QEMUIOVector *qiov;
  2012. int64_t pos;
  2013. bool is_read;
  2014. int ret;
  2015. } BdrvVmstateCo;
  2016. static int coroutine_fn
  2017. bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
  2018. bool is_read)
  2019. {
  2020. BlockDriver *drv = bs->drv;
  2021. int ret = -ENOTSUP;
  2022. bdrv_inc_in_flight(bs);
  2023. if (!drv) {
  2024. ret = -ENOMEDIUM;
  2025. } else if (drv->bdrv_load_vmstate) {
  2026. if (is_read) {
  2027. ret = drv->bdrv_load_vmstate(bs, qiov, pos);
  2028. } else {
  2029. ret = drv->bdrv_save_vmstate(bs, qiov, pos);
  2030. }
  2031. } else if (bs->file) {
  2032. ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
  2033. }
  2034. bdrv_dec_in_flight(bs);
  2035. return ret;
  2036. }
  2037. static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
  2038. {
  2039. BdrvVmstateCo *co = opaque;
  2040. co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
  2041. aio_wait_kick();
  2042. }
  2043. static inline int
  2044. bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
  2045. bool is_read)
  2046. {
  2047. if (qemu_in_coroutine()) {
  2048. return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
  2049. } else {
  2050. BdrvVmstateCo data = {
  2051. .bs = bs,
  2052. .qiov = qiov,
  2053. .pos = pos,
  2054. .is_read = is_read,
  2055. .ret = -EINPROGRESS,
  2056. };
  2057. Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
  2058. bdrv_coroutine_enter(bs, co);
  2059. BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
  2060. return data.ret;
  2061. }
  2062. }
  2063. int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
  2064. int64_t pos, int size)
  2065. {
  2066. QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
  2067. int ret;
  2068. ret = bdrv_writev_vmstate(bs, &qiov, pos);
  2069. if (ret < 0) {
  2070. return ret;
  2071. }
  2072. return size;
  2073. }
  2074. int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
  2075. {
  2076. return bdrv_rw_vmstate(bs, qiov, pos, false);
  2077. }
  2078. int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
  2079. int64_t pos, int size)
  2080. {
  2081. QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
  2082. int ret;
  2083. ret = bdrv_readv_vmstate(bs, &qiov, pos);
  2084. if (ret < 0) {
  2085. return ret;
  2086. }
  2087. return size;
  2088. }
  2089. int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
  2090. {
  2091. return bdrv_rw_vmstate(bs, qiov, pos, true);
  2092. }
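/*
 * Usage sketch (illustrative; 'bs', 'buf' and 'pos' are hypothetical
 * caller state): the byte-buffer helpers above just wrap the buffer in
 * a single-element QEMUIOVector, so saving and re-loading a blob of
 * vmstate looks like
 *
 *   bdrv_save_vmstate(bs, buf, pos, len);   returns 'len' on success
 *   bdrv_load_vmstate(bs, buf, pos, len);   reads the same bytes back
 *
 * and both return a negative errno on failure.
 */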
  2093. /**************************************************************/
  2094. /* async I/Os */
  2095. void bdrv_aio_cancel(BlockAIOCB *acb)
  2096. {
  2097. qemu_aio_ref(acb);
  2098. bdrv_aio_cancel_async(acb);
  2099. while (acb->refcnt > 1) {
  2100. if (acb->aiocb_info->get_aio_context) {
  2101. aio_poll(acb->aiocb_info->get_aio_context(acb), true);
  2102. } else if (acb->bs) {
  2103. /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
  2104. * assert that we're not using an I/O thread. Thread-safe
  2105. * code should use bdrv_aio_cancel_async exclusively.
  2106. */
  2107. assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
  2108. aio_poll(bdrv_get_aio_context(acb->bs), true);
  2109. } else {
  2110. abort();
  2111. }
  2112. }
  2113. qemu_aio_unref(acb);
  2114. }
  2115. /* Async version of aio cancel. The caller is not blocked if the acb implements
2116. * cancel_async; otherwise we do nothing and let the request complete normally.
  2117. * In either case the completion callback must be called. */
  2118. void bdrv_aio_cancel_async(BlockAIOCB *acb)
  2119. {
  2120. if (acb->aiocb_info->cancel_async) {
  2121. acb->aiocb_info->cancel_async(acb);
  2122. }
  2123. }
  2124. /**************************************************************/
  2125. /* Coroutine block device emulation */
  2126. typedef struct FlushCo {
  2127. BlockDriverState *bs;
  2128. int ret;
  2129. } FlushCo;
  2130. static void coroutine_fn bdrv_flush_co_entry(void *opaque)
  2131. {
  2132. FlushCo *rwco = opaque;
  2133. rwco->ret = bdrv_co_flush(rwco->bs);
  2134. aio_wait_kick();
  2135. }
  2136. int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
  2137. {
  2138. int current_gen;
  2139. int ret = 0;
  2140. bdrv_inc_in_flight(bs);
  2141. if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
  2142. bdrv_is_sg(bs)) {
  2143. goto early_exit;
  2144. }
  2145. qemu_co_mutex_lock(&bs->reqs_lock);
  2146. current_gen = atomic_read(&bs->write_gen);
  2147. /* Wait until any previous flushes are completed */
  2148. while (bs->active_flush_req) {
  2149. qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
  2150. }
  2151. /* Flushes reach this point in nondecreasing current_gen order. */
  2152. bs->active_flush_req = true;
  2153. qemu_co_mutex_unlock(&bs->reqs_lock);
  2154. /* Write back all layers by calling one driver function */
  2155. if (bs->drv->bdrv_co_flush) {
  2156. ret = bs->drv->bdrv_co_flush(bs);
  2157. goto out;
  2158. }
  2159. /* Write back cached data to the OS even with cache=unsafe */
  2160. BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
  2161. if (bs->drv->bdrv_co_flush_to_os) {
  2162. ret = bs->drv->bdrv_co_flush_to_os(bs);
  2163. if (ret < 0) {
  2164. goto out;
  2165. }
  2166. }
  2167. /* But don't actually force it to the disk with cache=unsafe */
  2168. if (bs->open_flags & BDRV_O_NO_FLUSH) {
  2169. goto flush_parent;
  2170. }
  2171. /* Check if we really need to flush anything */
  2172. if (bs->flushed_gen == current_gen) {
  2173. goto flush_parent;
  2174. }
  2175. BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
  2176. if (!bs->drv) {
  2177. /* bs->drv->bdrv_co_flush() might have ejected the BDS
  2178. * (even in case of apparent success) */
  2179. ret = -ENOMEDIUM;
  2180. goto out;
  2181. }
  2182. if (bs->drv->bdrv_co_flush_to_disk) {
  2183. ret = bs->drv->bdrv_co_flush_to_disk(bs);
  2184. } else if (bs->drv->bdrv_aio_flush) {
  2185. BlockAIOCB *acb;
  2186. CoroutineIOCompletion co = {
  2187. .coroutine = qemu_coroutine_self(),
  2188. };
  2189. acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
  2190. if (acb == NULL) {
  2191. ret = -EIO;
  2192. } else {
  2193. qemu_coroutine_yield();
  2194. ret = co.ret;
  2195. }
  2196. } else {
  2197. /*
  2198. * Some block drivers always operate in either writethrough or unsafe
2199. * mode and therefore don't support bdrv_flush. Usually qemu doesn't
  2200. * know how the server works (because the behaviour is hardcoded or
  2201. * depends on server-side configuration), so we can't ensure that
  2202. * everything is safe on disk. Returning an error doesn't work because
  2203. * that would break guests even if the server operates in writethrough
  2204. * mode.
  2205. *
2206. * Let's hope the user knows what they're doing.
  2207. */
  2208. ret = 0;
  2209. }
  2210. if (ret < 0) {
  2211. goto out;
  2212. }
  2213. /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH
  2214. * in the case of cache=unsafe, so there are no useless flushes.
  2215. */
  2216. flush_parent:
  2217. ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
  2218. out:
  2219. /* Notify any pending flushes that we have completed */
  2220. if (ret == 0) {
  2221. bs->flushed_gen = current_gen;
  2222. }
  2223. qemu_co_mutex_lock(&bs->reqs_lock);
  2224. bs->active_flush_req = false;
  2225. /* Return value is ignored - it's ok if wait queue is empty */
  2226. qemu_co_queue_next(&bs->flush_queue);
  2227. qemu_co_mutex_unlock(&bs->reqs_lock);
  2228. early_exit:
  2229. bdrv_dec_in_flight(bs);
  2230. return ret;
  2231. }
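/*
 * Note on the generation counters used above: bs->write_gen (advanced
 * elsewhere as writes complete) is sampled as current_gen before the
 * flush work starts; once the flush chain succeeds, flushed_gen is set
 * to that value, so a later flush that finds flushed_gen == current_gen
 * can skip the flush-to-disk step and only recurse into bs->file via
 * the flush_parent path.
 */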
  2232. int bdrv_flush(BlockDriverState *bs)
  2233. {
  2234. Coroutine *co;
  2235. FlushCo flush_co = {
  2236. .bs = bs,
  2237. .ret = NOT_DONE,
  2238. };
  2239. if (qemu_in_coroutine()) {
  2240. /* Fast-path if already in coroutine context */
  2241. bdrv_flush_co_entry(&flush_co);
  2242. } else {
  2243. co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
  2244. bdrv_coroutine_enter(bs, co);
  2245. BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
  2246. }
  2247. return flush_co.ret;
  2248. }
  2249. typedef struct DiscardCo {
  2250. BdrvChild *child;
  2251. int64_t offset;
  2252. int bytes;
  2253. int ret;
  2254. } DiscardCo;
  2255. static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
  2256. {
  2257. DiscardCo *rwco = opaque;
  2258. rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
  2259. aio_wait_kick();
  2260. }
  2261. int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
  2262. {
  2263. BdrvTrackedRequest req;
  2264. int max_pdiscard, ret;
  2265. int head, tail, align;
  2266. BlockDriverState *bs = child->bs;
  2267. if (!bs || !bs->drv) {
  2268. return -ENOMEDIUM;
  2269. }
  2270. if (bdrv_has_readonly_bitmaps(bs)) {
  2271. return -EPERM;
  2272. }
  2273. ret = bdrv_check_byte_request(bs, offset, bytes);
  2274. if (ret < 0) {
  2275. return ret;
  2276. }
  2277. /* Do nothing if disabled. */
  2278. if (!(bs->open_flags & BDRV_O_UNMAP)) {
  2279. return 0;
  2280. }
  2281. if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
  2282. return 0;
  2283. }
  2284. /* Discard is advisory, but some devices track and coalesce
  2285. * unaligned requests, so we must pass everything down rather than
  2286. * round here. Still, most devices will just silently ignore
  2287. * unaligned requests (by returning -ENOTSUP), so we must fragment
  2288. * the request accordingly. */
  2289. align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
  2290. assert(align % bs->bl.request_alignment == 0);
  2291. head = offset % align;
  2292. tail = (offset + bytes) % align;
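/*
 * Worked example with hypothetical limits: pdiscard_alignment = 64 KiB,
 * request_alignment = 4 KiB, offset = 100 KiB, bytes = 200 KiB.  Then
 * align = 64 KiB, head = 100 KiB % 64 KiB = 36 KiB and
 * tail = 300 KiB % 64 KiB = 44 KiB, so the loop below issues a 28 KiB
 * request up to the 128 KiB boundary, one aligned 128 KiB request, and
 * finally the 44 KiB tail.
 */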
  2293. bdrv_inc_in_flight(bs);
  2294. tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
  2295. ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
  2296. if (ret < 0) {
  2297. goto out;
  2298. }
  2299. max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
  2300. align);
  2301. assert(max_pdiscard >= bs->bl.request_alignment);
  2302. while (bytes > 0) {
  2303. int num = bytes;
  2304. if (head) {
  2305. /* Make small requests to get to alignment boundaries. */
  2306. num = MIN(bytes, align - head);
  2307. if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
  2308. num %= bs->bl.request_alignment;
  2309. }
  2310. head = (head + num) % align;
  2311. assert(num < max_pdiscard);
  2312. } else if (tail) {
  2313. if (num > align) {
  2314. /* Shorten the request to the last aligned cluster. */
  2315. num -= tail;
  2316. } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
  2317. tail > bs->bl.request_alignment) {
  2318. tail %= bs->bl.request_alignment;
  2319. num -= tail;
  2320. }
  2321. }
  2322. /* limit request size */
  2323. if (num > max_pdiscard) {
  2324. num = max_pdiscard;
  2325. }
  2326. if (!bs->drv) {
  2327. ret = -ENOMEDIUM;
  2328. goto out;
  2329. }
  2330. if (bs->drv->bdrv_co_pdiscard) {
  2331. ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
  2332. } else {
  2333. BlockAIOCB *acb;
  2334. CoroutineIOCompletion co = {
  2335. .coroutine = qemu_coroutine_self(),
  2336. };
  2337. acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
  2338. bdrv_co_io_em_complete, &co);
  2339. if (acb == NULL) {
  2340. ret = -EIO;
  2341. goto out;
  2342. } else {
  2343. qemu_coroutine_yield();
  2344. ret = co.ret;
  2345. }
  2346. }
  2347. if (ret && ret != -ENOTSUP) {
  2348. goto out;
  2349. }
  2350. offset += num;
  2351. bytes -= num;
  2352. }
  2353. ret = 0;
  2354. out:
  2355. bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
  2356. tracked_request_end(&req);
  2357. bdrv_dec_in_flight(bs);
  2358. return ret;
  2359. }
  2360. int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes)
  2361. {
  2362. Coroutine *co;
  2363. DiscardCo rwco = {
  2364. .child = child,
  2365. .offset = offset,
  2366. .bytes = bytes,
  2367. .ret = NOT_DONE,
  2368. };
  2369. if (qemu_in_coroutine()) {
  2370. /* Fast-path if already in coroutine context */
  2371. bdrv_pdiscard_co_entry(&rwco);
  2372. } else {
  2373. co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
  2374. bdrv_coroutine_enter(child->bs, co);
  2375. BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
  2376. }
  2377. return rwco.ret;
  2378. }
  2379. int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
  2380. {
  2381. BlockDriver *drv = bs->drv;
  2382. CoroutineIOCompletion co = {
  2383. .coroutine = qemu_coroutine_self(),
  2384. };
  2385. BlockAIOCB *acb;
  2386. bdrv_inc_in_flight(bs);
  2387. if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
  2388. co.ret = -ENOTSUP;
  2389. goto out;
  2390. }
  2391. if (drv->bdrv_co_ioctl) {
  2392. co.ret = drv->bdrv_co_ioctl(bs, req, buf);
  2393. } else {
  2394. acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
  2395. if (!acb) {
  2396. co.ret = -ENOTSUP;
  2397. goto out;
  2398. }
  2399. qemu_coroutine_yield();
  2400. }
  2401. out:
  2402. bdrv_dec_in_flight(bs);
  2403. return co.ret;
  2404. }
  2405. void *qemu_blockalign(BlockDriverState *bs, size_t size)
  2406. {
  2407. return qemu_memalign(bdrv_opt_mem_align(bs), size);
  2408. }
  2409. void *qemu_blockalign0(BlockDriverState *bs, size_t size)
  2410. {
  2411. return memset(qemu_blockalign(bs, size), 0, size);
  2412. }
  2413. void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
  2414. {
  2415. size_t align = bdrv_opt_mem_align(bs);
  2416. /* Ensure that NULL is never returned on success */
  2417. assert(align > 0);
  2418. if (size == 0) {
  2419. size = align;
  2420. }
  2421. return qemu_try_memalign(align, size);
  2422. }
  2423. void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
  2424. {
  2425. void *mem = qemu_try_blockalign(bs, size);
  2426. if (mem) {
  2427. memset(mem, 0, size);
  2428. }
  2429. return mem;
  2430. }
  2431. /*
  2432. * Check if all memory in this vector is sector aligned.
  2433. */
  2434. bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
  2435. {
  2436. int i;
  2437. size_t alignment = bdrv_min_mem_align(bs);
  2438. for (i = 0; i < qiov->niov; i++) {
  2439. if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
  2440. return false;
  2441. }
  2442. if (qiov->iov[i].iov_len % alignment) {
  2443. return false;
  2444. }
  2445. }
  2446. return true;
  2447. }
  2448. void bdrv_add_before_write_notifier(BlockDriverState *bs,
  2449. NotifierWithReturn *notifier)
  2450. {
  2451. notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
  2452. }
  2453. void bdrv_io_plug(BlockDriverState *bs)
  2454. {
  2455. BdrvChild *child;
  2456. QLIST_FOREACH(child, &bs->children, next) {
  2457. bdrv_io_plug(child->bs);
  2458. }
  2459. if (atomic_fetch_inc(&bs->io_plugged) == 0) {
  2460. BlockDriver *drv = bs->drv;
  2461. if (drv && drv->bdrv_io_plug) {
  2462. drv->bdrv_io_plug(bs);
  2463. }
  2464. }
  2465. }
  2466. void bdrv_io_unplug(BlockDriverState *bs)
  2467. {
  2468. BdrvChild *child;
  2469. assert(bs->io_plugged);
  2470. if (atomic_fetch_dec(&bs->io_plugged) == 1) {
  2471. BlockDriver *drv = bs->drv;
  2472. if (drv && drv->bdrv_io_unplug) {
  2473. drv->bdrv_io_unplug(bs);
  2474. }
  2475. }
  2476. QLIST_FOREACH(child, &bs->children, next) {
  2477. bdrv_io_unplug(child->bs);
  2478. }
  2479. }
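/*
 * Typical batching pattern (illustrative): a device that wants several
 * queued requests submitted as one batch brackets them with
 *
 *   bdrv_io_plug(bs);
 *   ... issue the requests ...
 *   bdrv_io_unplug(bs);
 *
 * The io_plugged counter above makes the calls nestable; the driver's
 * bdrv_io_plug/bdrv_io_unplug hooks only run at the outermost level.
 */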
  2480. void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
  2481. {
  2482. BdrvChild *child;
  2483. if (bs->drv && bs->drv->bdrv_register_buf) {
  2484. bs->drv->bdrv_register_buf(bs, host, size);
  2485. }
  2486. QLIST_FOREACH(child, &bs->children, next) {
  2487. bdrv_register_buf(child->bs, host, size);
  2488. }
  2489. }
  2490. void bdrv_unregister_buf(BlockDriverState *bs, void *host)
  2491. {
  2492. BdrvChild *child;
  2493. if (bs->drv && bs->drv->bdrv_unregister_buf) {
  2494. bs->drv->bdrv_unregister_buf(bs, host);
  2495. }
  2496. QLIST_FOREACH(child, &bs->children, next) {
  2497. bdrv_unregister_buf(child->bs, host);
  2498. }
  2499. }
  2500. static int coroutine_fn bdrv_co_copy_range_internal(
  2501. BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
  2502. uint64_t dst_offset, uint64_t bytes,
  2503. BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
  2504. bool recurse_src)
  2505. {
  2506. BdrvTrackedRequest req;
  2507. int ret;
  2508. /* TODO We can support BDRV_REQ_NO_FALLBACK here */
  2509. assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
  2510. assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
  2511. if (!dst || !dst->bs) {
  2512. return -ENOMEDIUM;
  2513. }
  2514. ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
  2515. if (ret) {
  2516. return ret;
  2517. }
  2518. if (write_flags & BDRV_REQ_ZERO_WRITE) {
  2519. return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
  2520. }
  2521. if (!src || !src->bs) {
  2522. return -ENOMEDIUM;
  2523. }
  2524. ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
  2525. if (ret) {
  2526. return ret;
  2527. }
  2528. if (!src->bs->drv->bdrv_co_copy_range_from
  2529. || !dst->bs->drv->bdrv_co_copy_range_to
  2530. || src->bs->encrypted || dst->bs->encrypted) {
  2531. return -ENOTSUP;
  2532. }
  2533. if (recurse_src) {
  2534. bdrv_inc_in_flight(src->bs);
  2535. tracked_request_begin(&req, src->bs, src_offset, bytes,
  2536. BDRV_TRACKED_READ);
  2537. /* BDRV_REQ_SERIALISING is only for write operation */
  2538. assert(!(read_flags & BDRV_REQ_SERIALISING));
  2539. if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
  2540. wait_serialising_requests(&req);
  2541. }
  2542. ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
  2543. src, src_offset,
  2544. dst, dst_offset,
  2545. bytes,
  2546. read_flags, write_flags);
  2547. tracked_request_end(&req);
  2548. bdrv_dec_in_flight(src->bs);
  2549. } else {
  2550. bdrv_inc_in_flight(dst->bs);
  2551. tracked_request_begin(&req, dst->bs, dst_offset, bytes,
  2552. BDRV_TRACKED_WRITE);
  2553. ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
  2554. write_flags);
  2555. if (!ret) {
  2556. ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
  2557. src, src_offset,
  2558. dst, dst_offset,
  2559. bytes,
  2560. read_flags, write_flags);
  2561. }
  2562. bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
  2563. tracked_request_end(&req);
  2564. bdrv_dec_in_flight(dst->bs);
  2565. }
  2566. return ret;
  2567. }
  2568. /* Copy range from @src to @dst.
  2569. *
  2570. * See the comment of bdrv_co_copy_range for the parameter and return value
  2571. * semantics. */
  2572. int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
  2573. BdrvChild *dst, uint64_t dst_offset,
  2574. uint64_t bytes,
  2575. BdrvRequestFlags read_flags,
  2576. BdrvRequestFlags write_flags)
  2577. {
  2578. trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
  2579. read_flags, write_flags);
  2580. return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
  2581. bytes, read_flags, write_flags, true);
  2582. }
  2583. /* Copy range from @src to @dst.
  2584. *
  2585. * See the comment of bdrv_co_copy_range for the parameter and return value
  2586. * semantics. */
  2587. int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
  2588. BdrvChild *dst, uint64_t dst_offset,
  2589. uint64_t bytes,
  2590. BdrvRequestFlags read_flags,
  2591. BdrvRequestFlags write_flags)
  2592. {
  2593. trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
  2594. read_flags, write_flags);
  2595. return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
  2596. bytes, read_flags, write_flags, false);
  2597. }
  2598. int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
  2599. BdrvChild *dst, uint64_t dst_offset,
  2600. uint64_t bytes, BdrvRequestFlags read_flags,
  2601. BdrvRequestFlags write_flags)
  2602. {
  2603. return bdrv_co_copy_range_from(src, src_offset,
  2604. dst, dst_offset,
  2605. bytes, read_flags, write_flags);
  2606. }
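/*
 * Illustrative call (hypothetical children 'src' and 'dst'): copying
 * 1 MiB between two nodes that both implement the copy-range hooks,
 * with no special flags, is simply
 *
 *   ret = bdrv_co_copy_range(src, src_off, dst, dst_off, 1 * MiB, 0, 0);
 *
 * which enters through the source side.  If either driver lacks the
 * hook (or either node is encrypted), the internal helper returns
 * -ENOTSUP and the caller has to fall back to ordinary reads and writes.
 */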
  2607. static void bdrv_parent_cb_resize(BlockDriverState *bs)
  2608. {
  2609. BdrvChild *c;
  2610. QLIST_FOREACH(c, &bs->parents, next_parent) {
  2611. if (c->role->resize) {
  2612. c->role->resize(c);
  2613. }
  2614. }
  2615. }
  2616. /**
  2617. * Truncate file to 'offset' bytes (needed only for file protocols)
  2618. */
  2619. int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
  2620. PreallocMode prealloc, Error **errp)
  2621. {
  2622. BlockDriverState *bs = child->bs;
  2623. BlockDriver *drv = bs->drv;
  2624. BdrvTrackedRequest req;
  2625. int64_t old_size, new_bytes;
  2626. int ret;
  2627. /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
  2628. if (!drv) {
  2629. error_setg(errp, "No medium inserted");
  2630. return -ENOMEDIUM;
  2631. }
  2632. if (offset < 0) {
  2633. error_setg(errp, "Image size cannot be negative");
  2634. return -EINVAL;
  2635. }
  2636. old_size = bdrv_getlength(bs);
  2637. if (old_size < 0) {
  2638. error_setg_errno(errp, -old_size, "Failed to get old image size");
  2639. return old_size;
  2640. }
  2641. if (offset > old_size) {
  2642. new_bytes = offset - old_size;
  2643. } else {
  2644. new_bytes = 0;
  2645. }
  2646. bdrv_inc_in_flight(bs);
  2647. tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
  2648. BDRV_TRACKED_TRUNCATE);
  2649. /* If we are growing the image and potentially using preallocation for the
  2650. * new area, we need to make sure that no write requests are made to it
  2651. * concurrently or they might be overwritten by preallocation. */
  2652. if (new_bytes) {
  2653. mark_request_serialising(&req, 1);
  2654. }
  2655. if (bs->read_only) {
  2656. error_setg(errp, "Image is read-only");
  2657. ret = -EACCES;
  2658. goto out;
  2659. }
  2660. ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
  2661. 0);
  2662. if (ret < 0) {
  2663. error_setg_errno(errp, -ret,
  2664. "Failed to prepare request for truncation");
  2665. goto out;
  2666. }
  2667. if (!drv->bdrv_co_truncate) {
  2668. if (bs->file && drv->is_filter) {
  2669. ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
  2670. goto out;
  2671. }
  2672. error_setg(errp, "Image format driver does not support resize");
  2673. ret = -ENOTSUP;
  2674. goto out;
  2675. }
  2676. ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
  2677. if (ret < 0) {
  2678. goto out;
  2679. }
  2680. ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
  2681. if (ret < 0) {
  2682. error_setg_errno(errp, -ret, "Could not refresh total sector count");
  2683. } else {
  2684. offset = bs->total_sectors * BDRV_SECTOR_SIZE;
  2685. }
  2686. /* It's possible that truncation succeeded but refresh_total_sectors
  2687. * failed, but the latter doesn't affect how we should finish the request.
  2688. * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
  2689. bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
  2690. out:
  2691. tracked_request_end(&req);
  2692. bdrv_dec_in_flight(bs);
  2693. return ret;
  2694. }
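/*
 * Illustrative example (hypothetical sizes): growing an image from
 * 1 GiB to 1 GiB + 64 MiB gives new_bytes = 64 MiB, so the tracked
 * request above covers exactly the newly added area
 * [1 GiB, 1 GiB + 64 MiB) and is marked serialising, keeping concurrent
 * writes out of a region that preallocation may still be touching.
 * Shrinking, or keeping the size unchanged, gives new_bytes = 0 and no
 * serialisation is needed.
 */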
  2695. typedef struct TruncateCo {
  2696. BdrvChild *child;
  2697. int64_t offset;
  2698. PreallocMode prealloc;
  2699. Error **errp;
  2700. int ret;
  2701. } TruncateCo;
  2702. static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
  2703. {
  2704. TruncateCo *tco = opaque;
  2705. tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
  2706. tco->errp);
  2707. aio_wait_kick();
  2708. }
  2709. int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
  2710. Error **errp)
  2711. {
  2712. Coroutine *co;
  2713. TruncateCo tco = {
  2714. .child = child,
  2715. .offset = offset,
  2716. .prealloc = prealloc,
  2717. .errp = errp,
  2718. .ret = NOT_DONE,
  2719. };
  2720. if (qemu_in_coroutine()) {
  2721. /* Fast-path if already in coroutine context */
  2722. bdrv_truncate_co_entry(&tco);
  2723. } else {
  2724. co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
  2725. bdrv_coroutine_enter(child->bs, co);
  2726. BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
  2727. }
  2728. return tco.ret;
  2729. }