aboutsummaryrefslogtreecommitdiff
path: root/sys
diff options
context:
space:
mode:
authorRick Macklem <rmacklem@FreeBSD.org>2019-07-25 05:46:16 +0000
committerRick Macklem <rmacklem@FreeBSD.org>2019-07-25 05:46:16 +0000
commitbbbbeca3e9a31483f5a24b81cd9426a36085e75e (patch)
treec3a2167140c061b34660089301d37a4b0586b854 /sys
parentab8cabb1ca9dbff39da55b17d8810a3bcefc6c80 (diff)
downloadsrc-bbbbeca3e9a31483f5a24b81cd9426a36085e75e.tar.gz
src-bbbbeca3e9a31483f5a24b81cd9426a36085e75e.zip
Notes
Diffstat (limited to 'sys')
-rw-r--r--sys/kern/syscalls.master10
-rw-r--r--sys/kern/vfs_default.c13
-rw-r--r--sys/kern/vfs_syscalls.c119
-rw-r--r--sys/kern/vfs_vnops.c369
-rw-r--r--sys/kern/vnode_if.src16
-rw-r--r--sys/sys/syscallsubr.h2
-rw-r--r--sys/sys/vnode.h8
7 files changed, 537 insertions, 0 deletions
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index f85c0ab3d2ff..879e2589c6bd 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3175,6 +3175,16 @@
int flag
);
}
+569 AUE_NULL STD {
+ ssize_t copy_file_range(
+ int infd,
+ _Inout_opt_ off_t *inoffp,
+ int outfd,
+ _Inout_opt_ off_t *outoffp,
+ size_t len,
+ unsigned int flags
+ );
+ }
; Please copy any additions and changes to the following compatibility tables:
; sys/compat/freebsd32/syscalls.master
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 3c30d2e6c9f1..920d6f19c8a9 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -83,6 +83,7 @@ static int dirent_exists(struct vnode *vp, const char *dirname,
static int vop_stdis_text(struct vop_is_text_args *ap);
static int vop_stdunset_text(struct vop_unset_text_args *ap);
static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
+static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap);
static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
@@ -140,6 +141,7 @@ struct vop_vector default_vnodeops = {
.vop_set_text = vop_stdset_text,
.vop_unset_text = vop_stdunset_text,
.vop_add_writecount = vop_stdadd_writecount,
+ .vop_copy_file_range = vop_stdcopy_file_range,
};
/*
@@ -1212,6 +1214,17 @@ vfs_stdnosync (mp, waitfor)
return (0);
}
+static int
+vop_stdcopy_file_range(struct vop_copy_file_range_args *ap)
+{
+ int error;
+
+ error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
+ ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred,
+ ap->a_outcred, ap->a_fsizetd);
+ return (error);
+}
+
int
vfs_stdvget (mp, ino, flags, vpp)
struct mount *mp;
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 5d9a59145d0e..ff0fc9f9867c 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -4814,3 +4814,122 @@ sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
uap->advice);
return (kern_posix_error(td, error));
}
+
+int
+kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
+ off_t *outoffp, size_t len, unsigned int flags)
+{
+ struct file *infp, *outfp;
+ struct vnode *invp, *outvp;
+ int error;
+ size_t retlen;
+ void *rl_rcookie, *rl_wcookie;
+ off_t savinoff, savoutoff;
+
+ infp = outfp = NULL;
+ rl_rcookie = rl_wcookie = NULL;
+ savinoff = -1;
+ error = 0;
+ retlen = 0;
+
+ if (flags != 0) {
+ error = EINVAL;
+ goto out;
+ }
+ if (len > SSIZE_MAX)
+ /*
+ * Although the len argument is size_t, the return argument
+ * is ssize_t (which is signed). Therefore a size that won't
+ * fit in ssize_t can't be returned.
+ */
+ len = SSIZE_MAX;
+
+ /* Get the file structures for the file descriptors. */
+ error = fget_read(td, infd, &cap_read_rights, &infp);
+ if (error != 0)
+ goto out;
+ error = fget_write(td, outfd, &cap_write_rights, &outfp);
+ if (error != 0)
+ goto out;
+
+ /* Set the offset pointers to the correct place. */
+ if (inoffp == NULL)
+ inoffp = &infp->f_offset;
+ if (outoffp == NULL)
+ outoffp = &outfp->f_offset;
+ savinoff = *inoffp;
+ savoutoff = *outoffp;
+
+ invp = infp->f_vnode;
+ outvp = outfp->f_vnode;
+ /* Sanity check the f_flag bits. */
+ if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
+ (infp->f_flag & FREAD) == 0 || invp == outvp) {
+ error = EBADF;
+ goto out;
+ }
+
+ /* If len == 0, just return 0. */
+ if (len == 0)
+ goto out;
+
+ /* Range lock the byte ranges for both invp and outvp. */
+ for (;;) {
+ rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
+ len);
+ rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
+ len);
+ if (rl_rcookie != NULL)
+ break;
+ vn_rangelock_unlock(outvp, rl_wcookie);
+ rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
+ vn_rangelock_unlock(invp, rl_rcookie);
+ }
+
+ retlen = len;
+ error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
+ flags, infp->f_cred, outfp->f_cred, td);
+out:
+ if (rl_rcookie != NULL)
+ vn_rangelock_unlock(invp, rl_rcookie);
+ if (rl_wcookie != NULL)
+ vn_rangelock_unlock(outvp, rl_wcookie);
+ if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
+ *inoffp = savinoff;
+ *outoffp = savoutoff;
+ }
+ if (outfp != NULL)
+ fdrop(outfp, td);
+ if (infp != NULL)
+ fdrop(infp, td);
+ td->td_retval[0] = retlen;
+ return (error);
+}
+
+int
+sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
+{
+ off_t inoff, outoff, *inoffp, *outoffp;
+ int error;
+
+ inoffp = outoffp = NULL;
+ if (uap->inoffp != NULL) {
+ error = copyin(uap->inoffp, &inoff, sizeof(off_t));
+ if (error != 0)
+ return (error);
+ inoffp = &inoff;
+ }
+ if (uap->outoffp != NULL) {
+ error = copyin(uap->outoffp, &outoff, sizeof(off_t));
+ if (error != 0)
+ return (error);
+ outoffp = &outoff;
+ }
+ error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
+ outoffp, uap->len, uap->flags);
+ if (error == 0 && uap->inoffp != NULL)
+ error = copyout(inoffp, uap->inoffp, sizeof(off_t));
+ if (error == 0 && uap->outoffp != NULL)
+ error = copyout(outoffp, uap->outoffp, sizeof(off_t));
+ return (error);
+}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 4649848639dd..b6fe702d7b0a 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -2619,3 +2619,372 @@ loop2:
return (error);
}
+
+/*
+ * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE()
+ * or vn_generic_copy_file_range() after rangelocking the byte ranges,
+ * to do the actual copy.
+ * vn_generic_copy_file_range() is factored out, so it can be called
+ * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
+ * different file systems.
+ */
+int
+vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
+ off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
+ struct ucred *outcred, struct thread *fsize_td)
+{
+ struct vattr va;
+ int error;
+ size_t len;
+ uint64_t uvalin, uvalout;
+
+ len = *lenp;
+ *lenp = 0; /* For error returns. */
+ error = 0;
+
+ /* Do some sanity checks on the arguments. */
+ uvalin = *inoffp;
+ uvalin += len;
+ uvalout = *outoffp;
+ uvalout += len;
+ if (invp->v_type == VDIR || outvp->v_type == VDIR)
+ error = EISDIR;
+ else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin <
+ (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX ||
+ uvalout < (uint64_t)*outoffp || invp->v_type != VREG ||
+ outvp->v_type != VREG)
+ error = EINVAL;
+ else if (invp == outvp)
+ error = EBADF;
+ if (error != 0)
+ goto out;
+
+ error = vn_lock(invp, LK_SHARED);
+ if (error != 0)
+ goto out;
+ /* Check that the offset + len does not go past EOF of invp. */
+ error = VOP_GETATTR(invp, &va, incred);
+ if (error == 0 && va.va_size < *inoffp + len)
+ error = EINVAL;
+ VOP_UNLOCK(invp, 0);
+ if (error != 0)
+ goto out;
+
+ /*
+	 * If the two vnodes are for the same file system, call
+ * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
+ * which can handle copies across multiple file systems.
+ */
+ *lenp = len;
+ if (invp->v_mount == outvp->v_mount)
+ error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
+ lenp, flags, incred, outcred, fsize_td);
+ else
+ error = vn_generic_copy_file_range(invp, inoffp, outvp,
+ outoffp, lenp, flags, incred, outcred, fsize_td);
+out:
+ return (error);
+}
+
+/*
+ * Test len bytes of data starting at dat for all bytes == 0.
+ * Return true if all bytes are zero, false otherwise.
+ * Expects dat to be well aligned.
+ */
+static bool
+mem_iszero(void *dat, int len)
+{
+ int i;
+ const u_int *p;
+ const char *cp;
+
+ for (p = dat; len > 0; len -= sizeof(*p), p++) {
+ if (len >= sizeof(*p)) {
+ if (*p != 0)
+ return (false);
+ } else {
+ cp = (const char *)p;
+ for (i = 0; i < len; i++, cp++)
+ if (*cp != '\0')
+ return (false);
+ }
+ }
+ return (true);
+}
+
+/*
+ * Write an xfer sized chunk to outvp in blksize blocks from dat.
+ * dat is a maximum of blksize in length and can be written repeatedly in
+ * the chunk.
+ * If growfile == true, just grow the file via vn_truncate_locked() instead
+ * of doing actual writes.
+ */
+static int
+vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
+ u_long blksize, bool growfile, struct ucred *cred)
+{
+ struct mount *mp;
+ off_t xfer2;
+ int error, lckf;
+
+ /*
+ * Loop around doing writes of blksize until write has been completed.
+ * Lock/unlock on each loop iteration so that a bwillwrite() can be
+ * done for each iteration, since the xfer argument can be very
+ * large if there is a large hole to punch in the output file.
+ */
+ do {
+ bwillwrite();
+ mp = NULL;
+ error = vn_start_write(outvp, &mp, V_WAIT);
+ if (error == 0) {
+ if (MNT_SHARED_WRITES(mp))
+ lckf = LK_SHARED;
+ else
+ lckf = LK_EXCLUSIVE;
+ error = vn_lock(outvp, lckf);
+ }
+ if (error == 0) {
+ if (growfile)
+ error = vn_truncate_locked(outvp, outoff + xfer,
+ false, cred);
+ else {
+ xfer2 = MIN(xfer, blksize);
+ error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
+ outoff, UIO_SYSSPACE, IO_NODELOCKED,
+ curthread->td_ucred, cred, NULL, curthread);
+ outoff += xfer2;
+ xfer -= xfer2;
+ }
+ VOP_UNLOCK(outvp, 0);
+ }
+ if (mp != NULL)
+ vn_finished_write(mp);
+ } while (!growfile && xfer > 0 && error == 0);
+ return (error);
+}
+
+/*
+ * Copy a byte range of one file to another. This function can handle the
+ * case where invp and outvp are on different file systems.
+ * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
+ * is no better file system specific way to do it.
+ */
+int
+vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
+ struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
+{
+ struct vattr va;
+ struct mount *mp;
+ struct uio io;
+ off_t startoff, endoff, xfer, xfer2;
+ u_long blksize;
+ int error;
+ bool cantseek, readzeros;
+ ssize_t aresid;
+ size_t copylen, len, savlen;
+ char *dat;
+ long holein, holeout;
+
+ holein = holeout = 0;
+ savlen = len = *lenp;
+ error = 0;
+ dat = NULL;
+
+ error = vn_lock(invp, LK_SHARED);
+ if (error != 0)
+ goto out;
+ if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
+ holein = 0;
+ VOP_UNLOCK(invp, 0);
+ if (error != 0)
+ goto out;
+
+ mp = NULL;
+ error = vn_start_write(outvp, &mp, V_WAIT);
+ if (error == 0)
+ error = vn_lock(outvp, LK_EXCLUSIVE);
+ if (error == 0) {
+ /*
+ * If fsize_td != NULL, do a vn_rlimit_fsize() call,
+ * now that outvp is locked.
+ */
+ if (fsize_td != NULL) {
+ io.uio_offset = *outoffp;
+ io.uio_resid = len;
+ error = vn_rlimit_fsize(outvp, &io, fsize_td);
+ if (error != 0)
+ error = EFBIG;
+ }
+ if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
+ holeout = 0;
+ /*
+ * Holes that are past EOF do not need to be written as a block
+ * of zero bytes. So, truncate the output file as far as
+ * possible and then use va.va_size to decide if writing 0
+ * bytes is necessary in the loop below.
+ */
+ if (error == 0)
+ error = VOP_GETATTR(outvp, &va, outcred);
+ if (error == 0 && va.va_size > *outoffp && va.va_size <=
+ *outoffp + len) {
+#ifdef MAC
+ error = mac_vnode_check_write(curthread->td_ucred,
+ outcred, outvp);
+ if (error == 0)
+#endif
+ error = vn_truncate_locked(outvp, *outoffp,
+ false, outcred);
+ if (error == 0)
+ va.va_size = *outoffp;
+ }
+ VOP_UNLOCK(outvp, 0);
+ }
+ if (mp != NULL)
+ vn_finished_write(mp);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Set the blksize to the larger of the hole sizes for invp and outvp.
+ * If hole sizes aren't available, set the blksize to the larger
+ * f_iosize of invp and outvp.
+ * This code expects the hole sizes and f_iosizes to be powers of 2.
+ * This value is clipped at 4Kbytes and 1Mbyte.
+ */
+ blksize = MAX(holein, holeout);
+ if (blksize == 0)
+ blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
+ outvp->v_mount->mnt_stat.f_iosize);
+ if (blksize < 4096)
+ blksize = 4096;
+ else if (blksize > 1024 * 1024)
+ blksize = 1024 * 1024;
+ dat = malloc(blksize, M_TEMP, M_WAITOK);
+
+ /*
+ * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
+ * to find holes. Otherwise, just scan the read block for all 0s
+ * in the inner loop where the data copying is done.
+ * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
+ * support holes on the server, but do not support FIOSEEKHOLE.
+ */
+ while (len > 0 && error == 0) {
+ endoff = 0; /* To shut up compilers. */
+ cantseek = true;
+ startoff = *inoffp;
+ copylen = len;
+
+ /*
+ * Find the next data area. If there is just a hole to EOF,
+ * FIOSEEKDATA should fail and then we drop down into the
+ * inner loop and create the hole on the outvp file.
+ * (I do not know if any file system will report a hole to
+ * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
+ * will fail for those file systems.)
+ *
+ * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
+ * the code just falls through to the inner copy loop.
+ */
+ error = EINVAL;
+ if (holein > 0)
+ error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
+ incred, curthread);
+ if (error == 0) {
+ endoff = startoff;
+ error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
+ incred, curthread);
+ }
+ if (error == 0) {
+ if (startoff > *inoffp) {
+ /* Found hole before data block. */
+ xfer = MIN(startoff - *inoffp, len);
+ if (*outoffp < va.va_size) {
+ /* Must write 0s to punch hole. */
+ xfer2 = MIN(va.va_size - *outoffp,
+ xfer);
+ memset(dat, 0, MIN(xfer2, blksize));
+ error = vn_write_outvp(outvp, dat,
+ *outoffp, xfer2, blksize, false,
+ outcred);
+ }
+
+ if (error == 0 && *outoffp + xfer >
+ va.va_size && xfer == len)
+ /* Grow last block. */
+ error = vn_write_outvp(outvp, dat,
+ *outoffp, xfer, blksize, true,
+ outcred);
+ if (error == 0) {
+ *inoffp += xfer;
+ *outoffp += xfer;
+ len -= xfer;
+ }
+ }
+ copylen = MIN(len, endoff - startoff);
+ cantseek = false;
+ } else {
+ cantseek = true;
+ startoff = *inoffp;
+ copylen = len;
+ error = 0;
+ }
+
+ xfer = blksize;
+ if (cantseek) {
+ /*
+ * Set first xfer to end at a block boundary, so that
+ * holes are more likely detected in the loop below via
+ * the for all bytes 0 method.
+ */
+ xfer -= (*inoffp % blksize);
+ }
+ /* Loop copying the data block. */
+ while (copylen > 0 && error == 0) {
+ if (copylen < xfer)
+ xfer = copylen;
+ error = vn_lock(invp, LK_SHARED);
+ if (error != 0)
+ goto out;
+ error = vn_rdwr(UIO_READ, invp, dat, xfer,
+ startoff, UIO_SYSSPACE, IO_NODELOCKED,
+ curthread->td_ucred, incred, &aresid,
+ curthread);
+ VOP_UNLOCK(invp, 0);
+ /*
+ * Linux considers a range that exceeds EOF to
+ * be an error, so we will too.
+ */
+ if (error == 0 && aresid > 0)
+ error = EINVAL;
+ if (error == 0) {
+ /*
+ * Skip the write for holes past the initial EOF
+ * of the output file, unless this is the last
+ * write of the output file at EOF.
+ */
+ readzeros = cantseek ? mem_iszero(dat, xfer) :
+ false;
+ if (!cantseek || *outoffp < va.va_size ||
+ xfer == len || !readzeros)
+ error = vn_write_outvp(outvp, dat,
+ *outoffp, xfer, blksize,
+ readzeros && xfer == len &&
+ *outoffp >= va.va_size, outcred);
+ if (error == 0) {
+ *inoffp += xfer;
+ startoff += xfer;
+ *outoffp += xfer;
+ copylen -= xfer;
+ len -= xfer;
+ }
+ }
+ xfer = blksize;
+ }
+ }
+out:
+ *lenp = savlen - len;
+ free(dat, M_TEMP);
+ return (error);
+}
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index 68fa84079af6..dc0c08424ca9 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -718,6 +718,22 @@ vop_fdatasync {
};
+%% copy_file_range invp U U U
+%% copy_file_range outvp U U U
+
+vop_copy_file_range {
+ IN struct vnode *invp;
+ INOUT off_t *inoffp;
+ IN struct vnode *outvp;
+ INOUT off_t *outoffp;
+ INOUT size_t *lenp;
+ IN unsigned int flags;
+ IN struct ucred *incred;
+ IN struct ucred *outcred;
+ IN struct thread *fsizetd;
+};
+
+
# The VOPs below are spares at the end of the table to allow new VOPs to be
# added in stable branches without breaking the KBI. New VOPs in HEAD should
# be added above these spares. When merging a new VOP to a stable branch,
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index 0c3ec79071d8..beb6e503ebc7 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -94,6 +94,8 @@ int kern_clock_settime(struct thread *td, clockid_t clock_id,
int kern_close(struct thread *td, int fd);
int kern_connectat(struct thread *td, int dirfd, int fd,
struct sockaddr *sa);
+int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp,
+ int outfd, off_t *outoffp, size_t len, unsigned int flags);
int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 92615c781bb0..faf1f1ac477a 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -667,9 +667,17 @@ int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
struct ucred *cred);
int vn_close(struct vnode *vp,
int flags, struct ucred *file_cred, struct thread *td);
+int vn_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp,
+ unsigned int flags, struct ucred *incred, struct ucred *outcred,
+ struct thread *fsize_td);
void vn_finished_write(struct mount *mp);
void vn_finished_secondary_write(struct mount *mp);
int vn_fsync_buf(struct vnode *vp, int waitfor);
+int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp,
+ unsigned int flags, struct ucred *incred, struct ucred *outcred,
+ struct thread *fsize_td);
int vn_isdisk(struct vnode *vp, int *errp);
int _vn_lock(struct vnode *vp, int flags, char *file, int line);
#define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)