diff options
| author | Alan Somers <asomers@FreeBSD.org> | 2021-01-02 23:34:20 +0000 | 
|---|---|---|
| committer | Alan Somers <asomers@FreeBSD.org> | 2021-01-03 02:57:58 +0000 | 
| commit | 022ca2fc7fe08d51f33a1d23a9be49e6d132914e (patch) | |
| tree | 3b757d6a22fe9ab1d6a9b8c7e98ee9c85b382877 | |
| parent | 486580c44ce29c1e3b1d9b858a08d9df9428b699 (diff) | |
| -rw-r--r-- | lib/libc/sys/Makefile.inc | 2 | ||||
| -rw-r--r-- | lib/libc/sys/Symbol.map | 2 | ||||
| -rw-r--r-- | lib/libc/sys/aio_error.2 | 6 | ||||
| -rw-r--r-- | lib/libc/sys/aio_read.2 | 63 | ||||
| -rw-r--r-- | lib/libc/sys/aio_return.2 | 4 | ||||
| -rw-r--r-- | lib/libc/sys/aio_write.2 | 63 | ||||
| -rw-r--r-- | share/man/man4/aio.4 | 4 | ||||
| -rw-r--r-- | sys/bsm/audit_kevents.h | 2 | ||||
| -rw-r--r-- | sys/compat/freebsd32/freebsd32_misc.c | 2 | ||||
| -rw-r--r-- | sys/compat/freebsd32/freebsd32_util.h | 2 | ||||
| -rw-r--r-- | sys/compat/freebsd32/syscalls.master | 6 | ||||
| -rw-r--r-- | sys/kern/capabilities.conf | 2 | ||||
| -rw-r--r-- | sys/kern/sys_socket.c | 35 | ||||
| -rw-r--r-- | sys/kern/syscalls.master | 12 | ||||
| -rw-r--r-- | sys/kern/vfs_aio.c | 497 | ||||
| -rw-r--r-- | sys/sys/aio.h | 22 | ||||
| -rw-r--r-- | tests/sys/aio/aio_test.c | 739 | 
17 files changed, 1171 insertions, 292 deletions
| diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc index d43a59719563..82d16fb81b6b 100644 --- a/lib/libc/sys/Makefile.inc +++ b/lib/libc/sys/Makefile.inc @@ -354,6 +354,8 @@ MAN+=	sctp_generic_recvmsg.2 \  	write.2 \  	_umtx_op.2 +MLINKS+=aio_read.2 aio_readv.2 +MLINKS+=aio_write.2 aio_writev.2  MLINKS+=accept.2 accept4.2  MLINKS+=access.2 eaccess.2 \  	access.2 faccessat.2 diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map index 9f0d3749ac01..847dd9cca987 100644 --- a/lib/libc/sys/Symbol.map +++ b/lib/libc/sys/Symbol.map @@ -403,6 +403,8 @@ FBSD_1.5 {  FBSD_1.6 {  	__sysctlbyname; +	aio_readv; +	aio_writev;  	close_range;  	copy_file_range;  	fhlink; diff --git a/lib/libc/sys/aio_error.2 b/lib/libc/sys/aio_error.2 index 030914616121..1ec6505a64aa 100644 --- a/lib/libc/sys/aio_error.2 +++ b/lib/libc/sys/aio_error.2 @@ -24,7 +24,7 @@  .\"  .\" $FreeBSD$  .\" -.Dd June 2, 1999 +.Dd January 2, 2021  .Dt AIO_ERROR 2  .Os  .Sh NAME @@ -52,7 +52,9 @@ is returned.  If the request has completed unsuccessfully the error  status is returned as described in  .Xr read 2 , +.Xr readv 2 ,  .Xr write 2 , +.Xr writev 2 ,  or  .Xr fsync 2 .  On failure, @@ -76,9 +78,11 @@ does not reference an outstanding asynchronous I/O request.  
.Sh SEE ALSO  .Xr aio_cancel 2 ,  .Xr aio_read 2 , +.Xr aio_readv 2 ,  .Xr aio_return 2 ,  .Xr aio_suspend 2 ,  .Xr aio_write 2 , +.Xr aio_writev 2 ,  .Xr fsync 2 ,  .Xr read 2 ,  .Xr write 2 , diff --git a/lib/libc/sys/aio_read.2 b/lib/libc/sys/aio_read.2 index bbf96cc89890..0327ef1f747b 100644 --- a/lib/libc/sys/aio_read.2 +++ b/lib/libc/sys/aio_read.2 @@ -24,11 +24,12 @@  .\"  .\" $FreeBSD$  .\" -.Dd August 19, 2016 +.Dd January 2, 2021  .Dt AIO_READ 2  .Os  .Sh NAME -.Nm aio_read +.Nm aio_read , +.Nm aio_readv  .Nd asynchronous read from a file (REALTIME)  .Sh LIBRARY  .Lb libc @@ -36,21 +37,42 @@  .In aio.h  .Ft int  .Fn aio_read "struct aiocb *iocb" +.In sys/uio.h +.Ft int +.Fn aio_readv "struct aiocb *iocb"  .Sh DESCRIPTION  The  .Fn aio_read -system call allows the calling process to read -.Fa iocb->aio_nbytes +and +.Fn aio_readv +system calls allow the calling process to read  from the descriptor  .Fa iocb->aio_fildes  beginning at the offset -.Fa iocb->aio_offset -into the buffer pointed to by -.Fa iocb->aio_buf . -The call returns immediately after the read request has +.Fa iocb->aio_offset . +.Fn aio_read +will read +.Fa iocb->aio_nbytes +into the buffer pointed to by +.Fa iocb->aio_buf , +whereas +.Fn aio_readv +reads the data into the +.Fa iocb->aio_iovcnt +buffers specified by the members of the +.Fa iocb->aio_iov +array. +Both syscalls return immediately after the read request has  been enqueued to the descriptor; the read may or may not have  completed at the time the call returns.  .Pp +For +.Fn aio_readv +the +.Fa iovec +structure is defined in +.Xr readv 2 . +.Pp  If _POSIX_PRIORITIZED_IO is defined, and the descriptor supports it,  then the enqueued operation is submitted at a priority equal to that  of the calling process minus @@ -61,7 +83,9 @@ The  argument  is ignored by the  .Fn aio_read -system call. +and +.Fn aio_readv +system calls.  
.Pp  The  .Fa iocb @@ -108,16 +132,22 @@ is past the offset maximum for  .Fa iocb->aio_fildes ,  no I/O will occur.  .Sh RETURN VALUES -.Rv -std aio_read +.Rv -std aio_read aio_readv  .Sh DIAGNOSTICS  None.  .Sh ERRORS  The  .Fn aio_read -system call will fail if: +and +.Fn aio_readv +system calls will fail if:  .Bl -tag -width Er  .It Bq Er EAGAIN  The request was not queued because of system resource limitations. +.It Bq Er EFAULT +Part of +.Fa aio_iov +points outside the process's allocated address space.  .It Bq Er EINVAL  The asynchronous notification method in  .Fa iocb->aio_sigevent.sigev_notify @@ -130,10 +160,14 @@ are unsafe and unsafe asynchronous I/O operations are disabled.  .Pp  The following conditions may be synchronously detected when the  .Fn aio_read +or +.Fn aio_readv  system call is made, or asynchronously, at any time thereafter.  If they  are detected at call time,  .Fn aio_read +or +.Fn aio_readv  returns -1 and sets  .Va errno  appropriately; otherwise the @@ -207,11 +241,18 @@ The  system call is expected to conform to the  .St -p1003.1  standard. +The +.Fn aio_readv +system call is a FreeBSD extension, and should not be used in portable code.  .Sh HISTORY  The  .Fn aio_read  system call first appeared in  .Fx 3.0 . +The +.Fn aio_readv +system call first appeared in +.Fx 13.0 .  .Sh AUTHORS  This  manual page was written by diff --git a/lib/libc/sys/aio_return.2 b/lib/libc/sys/aio_return.2 index df558734ed41..d94fcc7eba62 100644 --- a/lib/libc/sys/aio_return.2 +++ b/lib/libc/sys/aio_return.2 @@ -24,7 +24,7 @@  .\"  .\" $FreeBSD$  .\" -.Dd March 21, 2016 +.Dd January 2, 2021  .Dt AIO_RETURN 2  .Os  .Sh NAME @@ -55,7 +55,9 @@ returns something other than  If the asynchronous I/O request has completed, the status is returned  as described in  .Xr read 2 , +.Xr readv 2 ,  .Xr write 2 , +.Xr writev 2 ,  or  .Xr fsync 2 .  
Otherwise, diff --git a/lib/libc/sys/aio_write.2 b/lib/libc/sys/aio_write.2 index a3268e50ea90..601515b0e7b0 100644 --- a/lib/libc/sys/aio_write.2 +++ b/lib/libc/sys/aio_write.2 @@ -24,11 +24,12 @@  .\"  .\" $FreeBSD$  .\" -.Dd August 19, 2016 +.Dd January 2, 2021  .Dt AIO_WRITE 2  .Os  .Sh NAME -.Nm aio_write +.Nm aio_write , +.Nm aio_writev  .Nd asynchronous write to a file (REALTIME)  .Sh LIBRARY  .Lb libc @@ -36,28 +37,48 @@  .In aio.h  .Ft int  .Fn aio_write "struct aiocb *iocb" +.In sys/uio.h +.Ft int +.Fn aio_writev "struct aiocb *iocb"  .Sh DESCRIPTION  The  .Fn aio_write -system call allows the calling process to write -.Fa iocb->aio_nbytes -from the buffer pointed to by -.Fa iocb->aio_buf +and +.Fn aio_writev +system calls allow the calling process to write  to the descriptor  .Fa iocb->aio_fildes . -The call returns immediately after the write request has been enqueued +.Fn aio_write +will write +.Fa iocb->aio_nbytes +from the buffer pointed to by +.Fa iocb->aio_buf , +whereas +.Fn aio_writev +gathers the data from the +.Fa iocb->aio_iovcnt +buffers specified by the members of the +.Fa iocb->aio_iov +array. +Both syscalls return immediately after the write request has been enqueued  to the descriptor; the write may or may not have completed at the time  the call returns.  If the request could not be enqueued, generally due  to invalid arguments, the call returns without having enqueued the  request.  .Pp +For +.Fn aio_writev +the +.Fa iovec +structure is defined in +.Xr writev 2 . +.Pp  If  .Dv O_APPEND  is set for  .Fa iocb->aio_fildes , -.Fn aio_write -operations append to the file in the same order as the calls were +write operations append to the file in the same order as the calls were  made.  If  .Dv O_APPEND @@ -103,6 +124,8 @@ The asynchronous I/O control buffer  .Fa iocb  should be zeroed before the  .Fn aio_write +or +.Fn aio_writev  system call to avoid passing bogus context information to the kernel.  
.Pp  Modifications of the Asynchronous I/O Control Block structure or the @@ -114,14 +137,20 @@ is past the offset maximum for  .Fa iocb->aio_fildes ,  no I/O will occur.  .Sh RETURN VALUES -.Rv -std aio_write +.Rv -std aio_write aio_writev  .Sh ERRORS  The  .Fn aio_write -system call will fail if: +and +.Fn aio_writev +system calls will fail if:  .Bl -tag -width Er  .It Bq Er EAGAIN  The request was not queued because of system resource limitations. +.It Bq Er EFAULT +Part of +.Fa aio_iov +points outside the process's allocated address space.  .It Bq Er EINVAL  The asynchronous notification method in  .Fa iocb->aio_sigevent.sigev_notify @@ -134,10 +163,14 @@ are unsafe and unsafe asynchronous I/O operations are disabled.  .Pp  The following conditions may be synchronously detected when the  .Fn aio_write +or +.Fn aio_writev  system call is made, or asynchronously, at any time thereafter.  If they  are detected at call time,  .Fn aio_write +or +.Fn aio_writev  returns -1 and sets  .Va errno  appropriately; otherwise the @@ -203,11 +236,19 @@ system call  is expected to conform to the  .St -p1003.1  standard. +.Pp +The +.Fn aio_writev +system call is a FreeBSD extension, and should not be used in portable code.  .Sh HISTORY  The  .Fn aio_write  system call first appeared in  .Fx 3.0 . +The +.Fn aio_writev +system call first appeared in +.Fx 13.0 .  .Sh AUTHORS  This manual page was written by  .An Wes Peters Aq Mt wes@softweyr.com . 
diff --git a/share/man/man4/aio.4 b/share/man/man4/aio.4 index 0ea728499d13..513a5728defc 100644 --- a/share/man/man4/aio.4 +++ b/share/man/man4/aio.4 @@ -27,7 +27,7 @@  .\"  .\" $FreeBSD$  .\" -.Dd June 22, 2017 +.Dd January 2, 2021  .Dt AIO 4  .Os  .Sh NAME @@ -215,10 +215,12 @@ as described in  .Xr aio_cancel 2 ,  .Xr aio_error 2 ,  .Xr aio_read 2 , +.Xr aio_readv 2 ,  .Xr aio_return 2 ,  .Xr aio_suspend 2 ,  .Xr aio_waitcomplete 2 ,  .Xr aio_write 2 , +.Xr aio_writev 2 ,  .Xr lio_listio 2 ,  .Xr sigevent 3 ,  .Xr sysctl 8 diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h index 5b37329078a1..eeb928ecafdc 100644 --- a/sys/bsm/audit_kevents.h +++ b/sys/bsm/audit_kevents.h @@ -660,6 +660,8 @@  #define	AUE_REALPATHAT		43264	/* FreeBSD-specific. */  #define	AUE_CLOSERANGE		43265	/* FreeBSD-specific. */  #define	AUE_SPECIALFD		43266	/* FreeBSD-specific. */ +#define	AUE_AIO_WRITEV		43267	/* FreeBSD-specific. */ +#define	AUE_AIO_READV		43268	/* FreeBSD-specific. */  /*   * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c index 62fab95c68d1..14afd433d9f1 100644 --- a/sys/compat/freebsd32/freebsd32_misc.c +++ b/sys/compat/freebsd32/freebsd32_misc.c @@ -1070,7 +1070,7 @@ freebsd32_ptrace(struct thread *td, struct freebsd32_ptrace_args *uap)  	return (error);  } -static int +int  freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)  {  	struct iovec32 iov32; diff --git a/sys/compat/freebsd32/freebsd32_util.h b/sys/compat/freebsd32/freebsd32_util.h index a66038d4d36a..b126fbde0857 100644 --- a/sys/compat/freebsd32/freebsd32_util.h +++ b/sys/compat/freebsd32/freebsd32_util.h @@ -116,6 +116,8 @@ int	freebsd32_copyout_strings(struct image_params *imgp,  	    uintptr_t *stack_base);  int	freebsd32_copyiniov(struct iovec32 *iovp, u_int iovcnt,  	    struct iovec **iov, int error); +int	freebsd32_copyinuio(struct iovec32 *iovp, 
u_int iovcnt, +	    struct uio **uiop);  void	freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32);  struct image_args; diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master index f4339795781a..ca0db9a76b1e 100644 --- a/sys/compat/freebsd32/syscalls.master +++ b/sys/compat/freebsd32/syscalls.master @@ -493,8 +493,10 @@  257	AUE_LIO_LISTIO	STD	{ int freebsd32_lio_listio(int mode, \  				    struct aiocb32 * const *acb_list, \  				    int nent, struct sigevent32 *sig); } -258	AUE_NULL	UNIMPL	nosys -259	AUE_NULL	UNIMPL	nosys +258	AUE_AIO_WRITEV	STD	{ int freebsd32_aio_writev( \ +				    struct aiocb32 *aiocbp); } +259	AUE_AIO_READV	STD	{ int freebsd32_aio_readv( \ +				    struct aiocb32 *aiocbp); }  260	AUE_NULL	UNIMPL	nosys  261	AUE_NULL	UNIMPL	nosys  262	AUE_NULL	UNIMPL	nosys diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf index 3d552255d823..602ec7088fc6 100644 --- a/sys/kern/capabilities.conf +++ b/sys/kern/capabilities.conf @@ -100,6 +100,8 @@ aio_return  aio_suspend  aio_waitcomplete  aio_write +aio_writev +aio_readv  ##  ## audit(2) is a global operation, submitting to the global trail, but it is diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index 0fe200c119d2..18803b6a5ac0 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -600,9 +600,7 @@ soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job)  	struct ucred *td_savedcred;  	struct thread *td;  	struct file *fp; -	struct uio uio; -	struct iovec iov; -	size_t cnt, done; +	size_t cnt, done, job_total_nbytes;  	long ru_before;  	int error, flags; @@ -614,16 +612,11 @@ retry:  	td_savedcred = td->td_ucred;  	td->td_ucred = job->cred; +	job_total_nbytes = job->uiop->uio_resid + job->aio_done;  	done = job->aio_done; -	cnt = job->uaiocb.aio_nbytes - done; -	iov.iov_base = (void *)((uintptr_t)job->uaiocb.aio_buf + done); -	iov.iov_len = cnt; -	uio.uio_iov = &iov; -	uio.uio_iovcnt = 1; -	
uio.uio_offset = 0; -	uio.uio_resid = cnt; -	uio.uio_segflg = UIO_USERSPACE; -	uio.uio_td = td; +	cnt = job->uiop->uio_resid; +	job->uiop->uio_offset = 0; +	job->uiop->uio_td = td;  	flags = MSG_NBIO;  	/* @@ -633,26 +626,26 @@ retry:  	 */  	if (sb == &so->so_rcv) { -		uio.uio_rw = UIO_READ;  		ru_before = td->td_ru.ru_msgrcv;  #ifdef MAC  		error = mac_socket_check_receive(fp->f_cred, so);  		if (error == 0)  #endif -			error = soreceive(so, NULL, &uio, NULL, NULL, &flags); +			error = soreceive(so, NULL, job->uiop, NULL, NULL, +			    &flags);  		if (td->td_ru.ru_msgrcv != ru_before)  			job->msgrcv = 1;  	} else {  		if (!TAILQ_EMPTY(&sb->sb_aiojobq))  			flags |= MSG_MORETOCOME; -		uio.uio_rw = UIO_WRITE;  		ru_before = td->td_ru.ru_msgsnd;  #ifdef MAC  		error = mac_socket_check_send(fp->f_cred, so);  		if (error == 0)  #endif -			error = sosend(so, NULL, &uio, NULL, NULL, flags, td); +			error = sosend(so, NULL, job->uiop, NULL, NULL, flags, +			    td);  		if (td->td_ru.ru_msgsnd != ru_before)  			job->msgsnd = 1;  		if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { @@ -662,7 +655,7 @@ retry:  		}  	} -	done += cnt - uio.uio_resid; +	done += cnt - job->uiop->uio_resid;  	job->aio_done = done;  	td->td_ucred = td_savedcred; @@ -676,7 +669,7 @@ retry:  		 * been made, requeue this request at the head of the  		 * queue to try again when the socket is ready.  		 
*/ -		MPASS(done != job->uaiocb.aio_nbytes); +		MPASS(done != job_total_nbytes);  		SOCKBUF_LOCK(sb);  		if (done == 0 || !(so->so_state & SS_NBIO)) {  			empty_results++; @@ -782,10 +775,10 @@ soo_aio_cancel(struct kaiocb *job)  	so = job->fd_file->f_data;  	opcode = job->uaiocb.aio_lio_opcode; -	if (opcode == LIO_READ) +	if (opcode == LIO_READ || opcode == LIO_READV)  		sb = &so->so_rcv;  	else { -		MPASS(opcode == LIO_WRITE); +		MPASS(opcode == LIO_WRITE || opcode == LIO_WRITEV);  		sb = &so->so_snd;  	} @@ -817,9 +810,11 @@ soo_aio_queue(struct file *fp, struct kaiocb *job)  	switch (job->uaiocb.aio_lio_opcode) {  	case LIO_READ: +	case LIO_READV:  		sb = &so->so_rcv;  		break;  	case LIO_WRITE: +	case LIO_WRITEV:  		sb = &so->so_snd;  		break;  	default: diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index b7ea5e939635..aaa0a1277461 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -1477,7 +1477,17 @@  		    _In_opt_ struct sigevent *sig  		);  	} -258-271	AUE_NULL	UNIMPL	nosys +258	AUE_AIO_WRITEV	STD { +		int aio_writev( +		    _Inout_ struct aiocb *aiocbp +		); +	} +259	AUE_AIO_READV	STD { +		int aio_readv( +		    _Inout_ struct aiocb *aiocbp +		); +	} +260-271	AUE_NULL	UNIMPL	nosys  272	AUE_O_GETDENTS	COMPAT11 {  		int getdents(  		    int fd, diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index 37e19557d807..d83c9d725e68 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -292,7 +292,7 @@ struct kaioinfo {   * Different ABIs provide their own operations.   
*/  struct aiocb_ops { -	int	(*aio_copyin)(struct aiocb *ujob, struct aiocb *kjob); +	int	(*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty);  	long	(*fetch_status)(struct aiocb *ujob);  	long	(*fetch_error)(struct aiocb *ujob);  	int	(*store_status)(struct aiocb *ujob, long status); @@ -307,6 +307,7 @@ static struct mtx aio_job_mtx;  static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */  static struct unrhdr *aiod_unr; +static void	aio_biocleanup(struct bio *bp);  void		aio_init_aioinfo(struct proc *p);  static int	aio_onceonly(void);  static int	aio_free_entry(struct kaiocb *job); @@ -559,6 +560,8 @@ aio_free_entry(struct kaiocb *job)  	if (job->fd_file)  		fdrop(job->fd_file, curthread);  	crfree(job->cred); +	if (job->uiop != &job->uio) +		free(job->uiop, M_IOV);  	uma_zfree(aiocb_zone, job);  	AIO_LOCK(ki); @@ -754,36 +757,29 @@ aio_process_rw(struct kaiocb *job)  	struct thread *td;  	struct aiocb *cb;  	struct file *fp; -	struct uio auio; -	struct iovec aiov;  	ssize_t cnt;  	long msgsnd_st, msgsnd_end;  	long msgrcv_st, msgrcv_end;  	long oublock_st, oublock_end;  	long inblock_st, inblock_end; -	int error; +	int error, opcode;  	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || -	    job->uaiocb.aio_lio_opcode == LIO_WRITE, +	    job->uaiocb.aio_lio_opcode == LIO_READV || +	    job->uaiocb.aio_lio_opcode == LIO_WRITE || +	    job->uaiocb.aio_lio_opcode == LIO_WRITEV,  	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));  	aio_switch_vmspace(job);  	td = curthread;  	td_savedcred = td->td_ucred;  	td->td_ucred = job->cred; +	job->uiop->uio_td = td;  	cb = &job->uaiocb;  	fp = job->fd_file; -	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; -	aiov.iov_len = cb->aio_nbytes; - -	auio.uio_iov = &aiov; -	auio.uio_iovcnt = 1; -	auio.uio_offset = cb->aio_offset; -	auio.uio_resid = cb->aio_nbytes; -	cnt = cb->aio_nbytes; -	auio.uio_segflg = UIO_USERSPACE; -	auio.uio_td = td; +	opcode = job->uaiocb.aio_lio_opcode; +	cnt = 
job->uiop->uio_resid;  	msgrcv_st = td->td_ru.ru_msgrcv;  	msgsnd_st = td->td_ru.ru_msgsnd; @@ -794,17 +790,16 @@ aio_process_rw(struct kaiocb *job)  	 * aio_aqueue() acquires a reference to the file that is  	 * released in aio_free_entry().  	 */ -	if (cb->aio_lio_opcode == LIO_READ) { -		auio.uio_rw = UIO_READ; -		if (auio.uio_resid == 0) +	if (opcode == LIO_READ || opcode == LIO_READV) { +		if (job->uiop->uio_resid == 0)  			error = 0;  		else -			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); +			error = fo_read(fp, job->uiop, fp->f_cred, FOF_OFFSET, +			    td);  	} else {  		if (fp->f_type == DTYPE_VNODE)  			bwillwrite(); -		auio.uio_rw = UIO_WRITE; -		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); +		error = fo_write(fp, job->uiop, fp->f_cred, FOF_OFFSET, td);  	}  	msgrcv_end = td->td_ru.ru_msgrcv;  	msgsnd_end = td->td_ru.ru_msgsnd; @@ -816,17 +811,18 @@ aio_process_rw(struct kaiocb *job)  	job->inblock = inblock_end - inblock_st;  	job->outblock = oublock_end - oublock_st; -	if ((error) && (auio.uio_resid != cnt)) { +	if (error != 0 && job->uiop->uio_resid != cnt) {  		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)  			error = 0; -		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { +		if (error == EPIPE && +		    (opcode == LIO_WRITE || opcode == LIO_WRITEV)) {  			PROC_LOCK(job->userproc);  			kern_psignal(job->userproc, SIGPIPE);  			PROC_UNLOCK(job->userproc);  		}  	} -	cnt -= auio.uio_resid; +	cnt -= job->uiop->uio_resid;  	td->td_ucred = td_savedcred;  	if (error)  		aio_complete(job, -1, error); @@ -1210,21 +1206,23 @@ aio_qbio(struct proc *p, struct kaiocb *job)  {  	struct aiocb *cb;  	struct file *fp; -	struct bio *bp;  	struct buf *pbuf;  	struct vnode *vp;  	struct cdevsw *csw;  	struct cdev *dev;  	struct kaioinfo *ki; -	struct vm_page **pages; -	int error, npages, poff, ref; +	struct bio **bios = NULL; +	off_t offset; +	int bio_cmd, error, i, iovcnt, opcode, poff, ref;  	vm_prot_t prot; +	
bool use_unmapped;  	cb = &job->uaiocb;  	fp = job->fd_file; +	opcode = cb->aio_lio_opcode; -	if (!(cb->aio_lio_opcode == LIO_WRITE || -	    cb->aio_lio_opcode == LIO_READ)) +	if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV || +	    opcode == LIO_READ || opcode == LIO_READV))  		return (-1);  	if (fp == NULL || fp->f_type != DTYPE_VNODE)  		return (-1); @@ -1234,8 +1232,21 @@ aio_qbio(struct proc *p, struct kaiocb *job)  		return (-1);  	if (vp->v_bufobj.bo_bsize == 0)  		return (-1); -	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) + +	bio_cmd = opcode == LIO_WRITE || opcode == LIO_WRITEV ? BIO_WRITE : +	    BIO_READ; +	iovcnt = job->uiop->uio_iovcnt; +	if (iovcnt > max_buf_aio)  		return (-1); +	for (i = 0; i < iovcnt; i++) { +		if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0) +			return (-1); +		if (job->uiop->uio_iov[i].iov_len > maxphys) { +			error = -1; +			return (-1); +		} +	} +	offset = cb->aio_offset;  	ref = 0;  	csw = devvn_refthread(vp, &dev, &ref); @@ -1246,89 +1257,106 @@ aio_qbio(struct proc *p, struct kaiocb *job)  		error = -1;  		goto unref;  	} -	if (cb->aio_nbytes > dev->si_iosize_max) { +	if (job->uiop->uio_resid > dev->si_iosize_max) {  		error = -1;  		goto unref;  	}  	ki = p->p_aioinfo; -	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; -	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { -		if (cb->aio_nbytes > maxphys) { -			error = -1; -			goto unref; -		} +	job->error = 0; -		pbuf = NULL; -		pages = malloc(sizeof(vm_page_t) * (atop(round_page( -		    cb->aio_nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO); -	} else { -		if (cb->aio_nbytes > maxphys) { -			error = -1; -			goto unref; -		} -		if (ki->kaio_buffer_count >= max_buf_aio) { +	use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed; +	if (!use_unmapped) { +		AIO_LOCK(ki); +		if (ki->kaio_buffer_count + iovcnt > max_buf_aio) { +			AIO_UNLOCK(ki);  			error = EAGAIN;  			goto unref;  		} - -		pbuf = uma_zalloc(pbuf_zone, M_WAITOK); -		
BUF_KERNPROC(pbuf); -		AIO_LOCK(ki); -		ki->kaio_buffer_count++; +		ki->kaio_buffer_count += iovcnt;  		AIO_UNLOCK(ki); -		pages = pbuf->b_pages; -	} -	bp = g_alloc_bio(); - -	bp->bio_length = cb->aio_nbytes; -	bp->bio_bcount = cb->aio_nbytes; -	bp->bio_done = aio_biowakeup; -	bp->bio_offset = cb->aio_offset; -	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; -	bp->bio_dev = dev; -	bp->bio_caller1 = job; -	bp->bio_caller2 = pbuf; - -	prot = VM_PROT_READ; -	if (cb->aio_lio_opcode == LIO_READ) -		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */ -	npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, -	    (vm_offset_t)cb->aio_buf, bp->bio_length, prot, pages, -	    atop(maxphys) + 1); -	if (npages < 0) { -		error = EFAULT; -		goto doerror;  	} -	if (pbuf != NULL) { -		pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages); -		bp->bio_data = pbuf->b_data + poff; -		atomic_add_int(&num_buf_aio, 1); -		pbuf->b_npages = npages; -	} else { -		bp->bio_ma = pages; -		bp->bio_ma_n = npages; -		bp->bio_ma_offset = poff; -		bp->bio_data = unmapped_buf; -		bp->bio_flags |= BIO_UNMAPPED; -		atomic_add_int(&num_unmapped_aio, 1); + +	bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK); +	atomic_store_int(&job->nbio, iovcnt); +	for (i = 0; i < iovcnt; i++) { +		struct vm_page** pages; +		struct bio *bp; +		void *buf; +		size_t nbytes; +		int npages; + +		buf = job->uiop->uio_iov[i].iov_base; +		nbytes = job->uiop->uio_iov[i].iov_len; + +		bios[i] = g_alloc_bio(); +		bp = bios[i]; + +		poff = (vm_offset_t)buf & PAGE_MASK; +		if (use_unmapped) { +			pbuf = NULL; +			pages = malloc(sizeof(vm_page_t) * (atop(round_page( +			    nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO); +		} else { +			pbuf = uma_zalloc(pbuf_zone, M_WAITOK); +			BUF_KERNPROC(pbuf); +			pages = pbuf->b_pages; +		} + +		bp->bio_length = nbytes; +		bp->bio_bcount = nbytes; +		bp->bio_done = aio_biowakeup; +		bp->bio_offset = offset; +		bp->bio_cmd = bio_cmd; +		bp->bio_dev 
= dev; +		bp->bio_caller1 = job; +		bp->bio_caller2 = pbuf; + +		prot = VM_PROT_READ; +		if (opcode == LIO_READ || opcode == LIO_READV) +			prot |= VM_PROT_WRITE;	/* Less backwards than it looks */ +		npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, +		    (vm_offset_t)buf, bp->bio_length, prot, pages, +		    atop(maxphys) + 1); +		if (npages < 0) { +			if (pbuf != NULL) +				uma_zfree(pbuf_zone, pbuf); +			else +				free(pages, M_TEMP); +			error = EFAULT; +			g_destroy_bio(bp); +			i--; +			goto destroy_bios; +		} +		if (pbuf != NULL) { +			pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages); +			bp->bio_data = pbuf->b_data + poff; +			pbuf->b_npages = npages; +			atomic_add_int(&num_buf_aio, 1); +		} else { +			bp->bio_ma = pages; +			bp->bio_ma_n = npages; +			bp->bio_ma_offset = poff; +			bp->bio_data = unmapped_buf; +			bp->bio_flags |= BIO_UNMAPPED; +			atomic_add_int(&num_unmapped_aio, 1); +		} + +		offset += nbytes;  	}  	/* Perform transfer. */ -	csw->d_strategy(bp); +	for (i = 0; i < iovcnt; i++) +		csw->d_strategy(bios[i]); +	free(bios, M_TEMP); +  	dev_relthread(dev, ref);  	return (0); -doerror: -	if (pbuf != NULL) { -		AIO_LOCK(ki); -		ki->kaio_buffer_count--; -		AIO_UNLOCK(ki); -		uma_zfree(pbuf_zone, pbuf); -	} else { -		free(pages, M_TEMP); -	} -	g_destroy_bio(bp); +destroy_bios: +	for (; i >= 0; i--) +		aio_biocleanup(bios[i]); +	free(bios, M_TEMP);  unref:  	dev_relthread(dev, ref);  	return (error); @@ -1362,25 +1390,39 @@ convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)  }  static int -aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) +aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, +    int type __unused)  {  	struct oaiocb *ojob; +	struct aiocb *kcb = &kjob->uaiocb;  	int error; -	bzero(kjob, sizeof(struct aiocb)); -	error = copyin(ujob, kjob, sizeof(struct oaiocb)); +	bzero(kcb, sizeof(struct aiocb)); +	error = copyin(ujob, kcb, sizeof(struct oaiocb));  	if (error)  		
return (error); -	ojob = (struct oaiocb *)kjob; -	return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); +	/* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ +	ojob = (struct oaiocb *)kcb; +	return (convert_old_sigevent(&ojob->aio_sigevent, &kcb->aio_sigevent));  }  #endif  static int -aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) +aiocb_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type)  { +	struct aiocb *kcb = &kjob->uaiocb; +	int error; + +	error = copyin(ujob, kcb, sizeof(struct aiocb)); +	if (error) +		return (error); +	if (type == LIO_READV || type == LIO_WRITEV) { +		/* malloc a uio and copy in the iovec */ +		error = copyinuio(__DEVOLATILE(struct iovec*, kcb->aio_iov), +		    kcb->aio_iovcnt, &kjob->uiop); +	} -	return (copyin(ujob, kjob, sizeof(struct aiocb))); +	return (error);  }  static long @@ -1456,7 +1498,7 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,      int type, struct aiocb_ops *ops)  {  	struct proc *p = td->td_proc; -	struct file *fp; +	struct file *fp = NULL;  	struct kaiocb *job;  	struct kaioinfo *ki;  	struct kevent kev; @@ -1477,39 +1519,35 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,  	if (num_queue_count >= max_queue_count ||  	    ki->kaio_count >= max_aio_queue_per_proc) { -		ops->store_error(ujob, EAGAIN); -		return (EAGAIN); +		error = EAGAIN; +		goto err1;  	}  	job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);  	knlist_init_mtx(&job->klist, AIO_MTX(ki)); -	error = ops->aio_copyin(ujob, &job->uaiocb); -	if (error) { -		ops->store_error(ujob, error); -		uma_zfree(aiocb_zone, job); -		return (error); -	} +	error = ops->aio_copyin(ujob, job, type); +	if (error) +		goto err2;  	if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { -		uma_zfree(aiocb_zone, job); -		return (EINVAL); +		error = EINVAL; +		goto err2;  	}  	if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&  	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL 
&&  	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&  	    job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { -		ops->store_error(ujob, EINVAL); -		uma_zfree(aiocb_zone, job); -		return (EINVAL); +		error = EINVAL; +		goto err2;  	}  	if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||  	     job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&  		!_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { -		uma_zfree(aiocb_zone, job); -		return (EINVAL); +		error = EINVAL; +		goto err2;  	}  	ksiginfo_init(&job->ksi); @@ -1533,16 +1571,17 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,  	fd = job->uaiocb.aio_fildes;  	switch (opcode) {  	case LIO_WRITE: +	case LIO_WRITEV:  		error = fget_write(td, fd, &cap_pwrite_rights, &fp);  		break;  	case LIO_READ: +	case LIO_READV:  		error = fget_read(td, fd, &cap_pread_rights, &fp);  		break;  	case LIO_SYNC:  		error = fget(td, fd, &cap_fsync_rights, &fp);  		break;  	case LIO_MLOCK: -		fp = NULL;  		break;  	case LIO_NOP:  		error = fget(td, fd, &cap_no_rights, &fp); @@ -1550,22 +1589,20 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,  	default:  		error = EINVAL;  	} -	if (error) { -		uma_zfree(aiocb_zone, job); -		ops->store_error(ujob, error); -		return (error); -	} +	if (error) +		goto err3;  	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {  		error = EINVAL; -		goto aqueue_fail; +		goto err3;  	} -	if ((opcode == LIO_READ || opcode == LIO_WRITE) && +	if ((opcode == LIO_READ || opcode == LIO_READV || +	    opcode == LIO_WRITE || opcode == LIO_WRITEV) &&  	    job->uaiocb.aio_offset < 0 &&  	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) {  		error = EINVAL; -		goto aqueue_fail; +		goto err3;  	}  	job->fd_file = fp; @@ -1577,12 +1614,13 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,  	error = ops->store_kernelinfo(ujob, jid);  	if (error) {  		error = EINVAL; -		goto aqueue_fail; +		goto err3; 
 	}  	job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;  	if (opcode == LIO_NOP) {  		fdrop(fp, td); +		MPASS(job->uiop == &job->uio || job->uiop == NULL);  		uma_zfree(aiocb_zone, job);  		return (0);  	} @@ -1592,7 +1630,7 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,  	evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags;  	if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {  		error = EINVAL; -		goto aqueue_fail; +		goto err3;  	}  	kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue;  	memset(&kev, 0, sizeof(kev)); @@ -1603,7 +1641,7 @@ aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj,  	kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr;  	error = kqfd_register(kqfd, &kev, td, M_WAITOK);  	if (error) -		goto aqueue_fail; +		goto err3;  no_kqueue: @@ -1614,6 +1652,39 @@ no_kqueue:  	job->jobflags = KAIOCB_QUEUEING;  	job->lio = lj; +	switch (opcode) { +	case LIO_READV: +	case LIO_WRITEV: +		/* Use the uio copied in by aio_copyin */ +		MPASS(job->uiop != &job->uio && job->uiop != NULL); +		break; +	case LIO_READ: +	case LIO_WRITE: +		/* Setup the inline uio */ +		job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf; +		job->iov[0].iov_len = job->uaiocb.aio_nbytes; +		job->uio.uio_iov = job->iov; +		job->uio.uio_iovcnt = 1; +		job->uio.uio_resid = job->uaiocb.aio_nbytes; +		job->uio.uio_segflg = UIO_USERSPACE; +		/* FALLTHROUGH */ +	default: +		job->uiop = &job->uio; +		break; +	} +	switch (opcode) { +	case LIO_READ: +	case LIO_READV: +		job->uiop->uio_rw = UIO_READ; +		break; +	case LIO_WRITE: +	case LIO_WRITEV: +		job->uiop->uio_rw = UIO_WRITE; +		break; +	} +	job->uiop->uio_offset = job->uaiocb.aio_offset; +	job->uiop->uio_td = td; +  	if (opcode == LIO_MLOCK) {  		aio_schedule(job, aio_process_mlock);  		error = 0; @@ -1622,7 +1693,7 @@ no_kqueue:  	else  		error = fo_aio_queue(fp, job);  	if (error) -		goto aqueue_fail; +		goto err3;  	AIO_LOCK(ki);  	
job->jobflags &= ~KAIOCB_QUEUEING; @@ -1643,11 +1714,15 @@ no_kqueue:  	AIO_UNLOCK(ki);  	return (0); -aqueue_fail: -	knlist_delete(&job->klist, curthread, 0); +err3:  	if (fp)  		fdrop(fp, td); +	knlist_delete(&job->klist, curthread, 0); +err2: +	if (job->uiop != &job->uio) +		free(job->uiop, M_IOV);  	uma_zfree(aiocb_zone, job); +err1:  	ops->store_error(ujob, error);  	return (error);  } @@ -1723,7 +1798,9 @@ aio_queue_file(struct file *fp, struct kaiocb *job)  	switch (job->uaiocb.aio_lio_opcode) {  	case LIO_READ: +	case LIO_READV:  	case LIO_WRITE: +	case LIO_WRITEV:  		aio_schedule(job, aio_process_rw);  		error = 0;  		break; @@ -2097,6 +2174,13 @@ sys_aio_read(struct thread *td, struct aio_read_args *uap)  	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));  } +int +sys_aio_readv(struct thread *td, struct aio_readv_args *uap) +{ + +	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops)); +} +  /* syscall - asynchronous write to a file (REALTIME) */  #ifdef COMPAT_FREEBSD6  int @@ -2116,6 +2200,13 @@ sys_aio_write(struct thread *td, struct aio_write_args *uap)  }  int +sys_aio_writev(struct thread *td, struct aio_writev_args *uap) +{ + +	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops)); +} + +int  sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)  { @@ -2337,13 +2428,11 @@ sys_lio_listio(struct thread *td, struct lio_listio_args *uap)  }  static void -aio_biowakeup(struct bio *bp) +aio_biocleanup(struct bio *bp)  {  	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1;  	struct kaioinfo *ki; -	struct buf *pbuf = (struct buf*)bp->bio_caller2; -	size_t nbytes; -	int error, nblks; +	struct buf *pbuf = (struct buf *)bp->bio_caller2;  	/* Release mapping into kernel space. 
*/  	if (pbuf != NULL) { @@ -2362,23 +2451,47 @@ aio_biowakeup(struct bio *bp)  		free(bp->bio_ma, M_TEMP);  		atomic_subtract_int(&num_unmapped_aio, 1);  	} +	g_destroy_bio(bp); +} -	nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; -	error = 0; -	if (bp->bio_flags & BIO_ERROR) -		error = bp->bio_error; +static void +aio_biowakeup(struct bio *bp) +{ +	struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; +	size_t nbytes; +	long bcount = bp->bio_bcount; +	long resid = bp->bio_resid; +	int error, opcode, nblks; +	int bio_error = bp->bio_error; +	uint16_t flags = bp->bio_flags; + +	opcode = job->uaiocb.aio_lio_opcode; + +	aio_biocleanup(bp); + +	nbytes =bcount - resid; +	atomic_add_acq_long(&job->nbytes, nbytes);  	nblks = btodb(nbytes); -	if (job->uaiocb.aio_lio_opcode == LIO_WRITE) -		job->outblock += nblks; +	error = 0; +	/* +	 * If multiple bios experienced an error, the job will reflect the +	 * error of whichever failed bio completed last. +	 */ +	if (flags & BIO_ERROR) +		atomic_set_int(&job->error, bio_error); +	if (opcode == LIO_WRITE || opcode == LIO_WRITEV) +		atomic_add_int(&job->outblock, nblks);  	else -		job->inblock += nblks; +		atomic_add_int(&job->inblock, nblks); +	atomic_subtract_int(&job->nbio, 1); -	if (error) -		aio_complete(job, -1, error); -	else -		aio_complete(job, nbytes, 0); -	g_destroy_bio(bp); +	if (atomic_load_int(&job->nbio) == 0) { +		if (atomic_load_int(&job->error)) +			aio_complete(job, -1, job->error); +		else +			aio_complete(job, atomic_load_long(&job->nbytes), 0); +	}  }  /* syscall - wait for the next completion of an aio request */ @@ -2614,8 +2727,8 @@ typedef struct oaiocb32 {  typedef struct aiocb32 {  	int32_t	aio_fildes;		/* File descriptor */  	uint64_t aio_offset __packed;	/* File offset for I/O */ -	uint32_t aio_buf;		/* I/O buffer in process space */ -	uint32_t aio_nbytes;		/* Number of bytes for I/O */ +	uint32_t aio_buf;	/* I/O buffer in process space */ +	uint32_t aio_nbytes;	/* Number of bytes for I/O */  	int	
__spare__[2];  	uint32_t __spare2__;  	int	aio_lio_opcode;		/* LIO opcode */ @@ -2652,49 +2765,67 @@ convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)  }  static int -aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) +aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, +    int type __unused)  {  	struct oaiocb32 job32; +	struct aiocb *kcb = &kjob->uaiocb;  	int error; -	bzero(kjob, sizeof(struct aiocb)); +	bzero(kcb, sizeof(struct aiocb));  	error = copyin(ujob, &job32, sizeof(job32));  	if (error)  		return (error); -	CP(job32, *kjob, aio_fildes); -	CP(job32, *kjob, aio_offset); -	PTRIN_CP(job32, *kjob, aio_buf); -	CP(job32, *kjob, aio_nbytes); -	CP(job32, *kjob, aio_lio_opcode); -	CP(job32, *kjob, aio_reqprio); -	CP(job32, *kjob, _aiocb_private.status); -	CP(job32, *kjob, _aiocb_private.error); -	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); +	/* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ + +	CP(job32, *kcb, aio_fildes); +	CP(job32, *kcb, aio_offset); +	PTRIN_CP(job32, *kcb, aio_buf); +	CP(job32, *kcb, aio_nbytes); +	CP(job32, *kcb, aio_lio_opcode); +	CP(job32, *kcb, aio_reqprio); +	CP(job32, *kcb, _aiocb_private.status); +	CP(job32, *kcb, _aiocb_private.error); +	PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo);  	return (convert_old_sigevent32(&job32.aio_sigevent, -	    &kjob->aio_sigevent)); +	    &kcb->aio_sigevent));  }  #endif  static int -aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) +aiocb32_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type)  {  	struct aiocb32 job32; +	struct aiocb *kcb = &kjob->uaiocb; +	struct iovec32 *iov32;  	int error;  	error = copyin(ujob, &job32, sizeof(job32));  	if (error)  		return (error); -	CP(job32, *kjob, aio_fildes); -	CP(job32, *kjob, aio_offset); -	PTRIN_CP(job32, *kjob, aio_buf); -	CP(job32, *kjob, aio_nbytes); -	CP(job32, *kjob, aio_lio_opcode); -	CP(job32, *kjob, aio_reqprio); -	CP(job32, *kjob, 
_aiocb_private.status); -	CP(job32, *kjob, _aiocb_private.error); -	PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); -	return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); +	CP(job32, *kcb, aio_fildes); +	CP(job32, *kcb, aio_offset); +	CP(job32, *kcb, aio_lio_opcode); +	if (type == LIO_READV || type == LIO_WRITEV) { +		iov32 = PTRIN(job32.aio_iov); +		CP(job32, *kcb, aio_iovcnt); +		/* malloc a uio and copy in the iovec */ +		error = freebsd32_copyinuio(iov32, +		    kcb->aio_iovcnt, &kjob->uiop); +		if (error) +			return (error); +	} else { +		PTRIN_CP(job32, *kcb, aio_buf); +		CP(job32, *kcb, aio_nbytes); +	} +	CP(job32, *kcb, aio_reqprio); +	CP(job32, *kcb, _aiocb_private.status); +	CP(job32, *kcb, _aiocb_private.error); +	PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); +	error = convert_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent); + +	return (error);  }  static long @@ -2840,6 +2971,14 @@ freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)  	    &aiocb32_ops));  } +int +freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap) +{ + +	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READV, +	    &aiocb32_ops)); +} +  #ifdef COMPAT_FREEBSD6  int  freebsd6_freebsd32_aio_write(struct thread *td, @@ -2860,6 +2999,14 @@ freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)  }  int +freebsd32_aio_writev(struct thread *td, struct freebsd32_aio_writev_args *uap) +{ + +	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITEV, +	    &aiocb32_ops)); +} + +int  freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)  { diff --git a/sys/sys/aio.h b/sys/sys/aio.h index 71afdbf31b35..c0e2b4eaaaf6 100644 --- a/sys/sys/aio.h +++ b/sys/sys/aio.h @@ -27,6 +27,7 @@  #include <sys/queue.h>  #include <sys/event.h>  #include <sys/signalvar.h> +#include <sys/uio.h>  #endif  /* @@ -45,6 +46,8 @@  #ifdef _KERNEL  #define	LIO_SYNC		0x3  #define	
LIO_MLOCK		0x4 +#define	LIO_WRITEV		0x5 +#define	LIO_READV		0x6  #endif  /* @@ -92,7 +95,7 @@ struct __aiocb_private {  typedef struct aiocb {  	int	aio_fildes;		/* File descriptor */  	off_t	aio_offset;		/* File offset for I/O */ -	volatile void *aio_buf;         /* I/O buffer in process space */ +	volatile void *aio_buf;		/* I/O buffer in process space */  	size_t	aio_nbytes;		/* Number of bytes for I/O */  	int	__spare__[2];  	void	*__spare2__; @@ -102,6 +105,9 @@ typedef struct aiocb {  	struct	sigevent aio_sigevent;	/* Signal to deliver */  } aiocb_t; +#define	aio_iov	aio_buf			/* I/O scatter/gather list */ +#define	aio_iovcnt	aio_nbytes	/* Length of aio_iov */ +  #ifdef _KERNEL  typedef void aio_cancel_fn_t(struct kaiocb *); @@ -132,11 +138,19 @@ struct kaiocb {  	struct	aiocb *ujob;		/* (*) pointer in userspace of aiocb */  	struct	knlist klist;		/* (a) list of knotes */  	struct	aiocb uaiocb;		/* (*) copy of user I/O control block */ +	struct	uio uio;		/* (*) storage for non-vectored uio */ +	struct	iovec iov[1];		/* (*) storage for non-vectored uio */ +	struct	uio *uiop;		/* (*) Possibly malloced uio */  	ksiginfo_t ksi;			/* (a) realtime signal info */  	uint64_t seqno;			/* (*) job number */  	aio_cancel_fn_t *cancel_fn;	/* (a) backend cancel function */  	aio_handle_fn_t *handle_fn;	/* (c) backend handle function */  	union {				/* Backend-specific data fields */ +		struct {		/* BIO backend */ +			int	nbio;	/* Number of remaining bios */ +			int	error;	/* Worst error of all bios */ +			long	nbytes;	/* Bytes completed so far */ +		};  		struct {		/* fsync() requests */  			int	pending; /* (a) number of pending I/O */  		}; @@ -202,11 +216,17 @@ __BEGIN_DECLS   * Asynchronously read from a file   */  int	aio_read(struct aiocb *); +#if __BSD_VISIBLE +int	aio_readv(struct aiocb *); +#endif  /*   * Asynchronously write to file   */  int	aio_write(struct aiocb *); +#if __BSD_VISIBLE +int	aio_writev(struct aiocb *); +#endif  /*   * List I/O 
Asynchronously/synchronously read/write to/from file diff --git a/tests/sys/aio/aio_test.c b/tests/sys/aio/aio_test.c index a9216335d768..891892e5e757 100644 --- a/tests/sys/aio/aio_test.c +++ b/tests/sys/aio/aio_test.c @@ -282,6 +282,47 @@ aio_write_test(struct aio_context *ac, completion comp, struct sigevent *sev)  }  /* + * Perform a vectored I/O test of our initialized data buffer to the provided + * file descriptor. + * + * To vectorize the linear buffer, chop it up into two pieces of dissimilar + * size, and swap their offsets. + */ +static void +aio_writev_test(struct aio_context *ac, completion comp, struct sigevent *sev) +{ +	struct aiocb aio; +	struct iovec iov[2]; +	size_t len0, len1; +	ssize_t len; + +	bzero(&aio, sizeof(aio)); + +	aio.aio_fildes = ac->ac_write_fd; +	aio.aio_offset = 0; +	len0 = ac->ac_buflen * 3 / 4; +	len1 = ac->ac_buflen / 4; +	iov[0].iov_base = ac->ac_buffer + len1; +	iov[0].iov_len = len0; +	iov[1].iov_base = ac->ac_buffer; +	iov[1].iov_len = len1; +	aio.aio_iov = iov; +	aio.aio_iovcnt = 2; +	if (sev) +		aio.aio_sigevent = *sev; + +	if (aio_writev(&aio) < 0) +		atf_tc_fail("aio_writev failed: %s", strerror(errno)); + +	len = comp(&aio); +	if (len < 0) +		atf_tc_fail("aio failed: %s", strerror(errno)); + +	if (len != ac->ac_buflen) +		atf_tc_fail("aio short write (%jd)", (intmax_t)len); +} + +/*   * Perform a simple read test of our initialized data buffer from the   * provided file descriptor.   
*/ @@ -314,6 +355,43 @@ aio_read_test(struct aio_context *ac, completion comp, struct sigevent *sev)  		atf_tc_fail("buffer mismatched");  } +static void +aio_readv_test(struct aio_context *ac, completion comp, struct sigevent *sev) +{ +	struct aiocb aio; +	struct iovec iov[2]; +	size_t len0, len1; +	ssize_t len; + +	bzero(ac->ac_buffer, ac->ac_buflen); +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = ac->ac_read_fd; +	aio.aio_offset = 0; +	len0 = ac->ac_buflen * 3 / 4; +	len1 = ac->ac_buflen / 4; +	iov[0].iov_base = ac->ac_buffer + len1; +	iov[0].iov_len = len0; +	iov[1].iov_base = ac->ac_buffer; +	iov[1].iov_len = len1; +	aio.aio_iov = iov; +	aio.aio_iovcnt = 2; +	if (sev) +		aio.aio_sigevent = *sev; + +	if (aio_readv(&aio) < 0) +		atf_tc_fail("aio_read failed: %s", strerror(errno)); + +	len = comp(&aio); +	if (len < 0) +		atf_tc_fail("aio failed: %s", strerror(errno)); + +	ATF_REQUIRE_EQ_MSG(len, ac->ac_buflen, +	    "aio short read (%jd)", (intmax_t)len); + +	if (aio_test_buffer(ac->ac_buffer, ac->ac_buflen, ac->ac_seed) == 0) +		atf_tc_fail("buffer mismatched"); +} +  /*   * Series of type-specific tests for AIO.  For now, we just make sure we can   * issue a write and then a read to each type.  
We assume that once a write @@ -328,7 +406,7 @@ aio_read_test(struct aio_context *ac, completion comp, struct sigevent *sev)  #define	FILE_PATHNAME	"testfile"  static void -aio_file_test(completion comp, struct sigevent *sev) +aio_file_test(completion comp, struct sigevent *sev, bool vectored)  {  	struct aio_context ac;  	int fd; @@ -340,39 +418,44 @@ aio_file_test(completion comp, struct sigevent *sev)  	ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno));  	aio_context_init(&ac, fd, fd, FILE_LEN); -	aio_write_test(&ac, comp, sev); -	aio_read_test(&ac, comp, sev); +	if (vectored) { +		aio_writev_test(&ac, comp, sev); +		aio_readv_test(&ac, comp, sev); +	} else { +		aio_write_test(&ac, comp, sev); +		aio_read_test(&ac, comp, sev); +	}  	close(fd);  }  ATF_TC_WITHOUT_HEAD(file_poll);  ATF_TC_BODY(file_poll, tc)  { -	aio_file_test(poll, NULL); +	aio_file_test(poll, NULL, false);  }  ATF_TC_WITHOUT_HEAD(file_signal);  ATF_TC_BODY(file_signal, tc)  { -	aio_file_test(poll_signaled, setup_signal()); +	aio_file_test(poll_signaled, setup_signal(), false);  }  ATF_TC_WITHOUT_HEAD(file_suspend);  ATF_TC_BODY(file_suspend, tc)  { -	aio_file_test(suspend, NULL); +	aio_file_test(suspend, NULL, false);  }  ATF_TC_WITHOUT_HEAD(file_thread);  ATF_TC_BODY(file_thread, tc)  { -	aio_file_test(poll_signaled, setup_thread()); +	aio_file_test(poll_signaled, setup_thread(), false);  }  ATF_TC_WITHOUT_HEAD(file_waitcomplete);  ATF_TC_BODY(file_waitcomplete, tc)  { -	aio_file_test(waitcomplete, NULL); +	aio_file_test(waitcomplete, NULL, false);  }  #define	FIFO_LEN	256 @@ -446,7 +529,7 @@ ATF_TC_BODY(fifo_waitcomplete, tc)  #define	UNIX_SOCKETPAIR_LEN	256  static void -aio_unix_socketpair_test(completion comp, struct sigevent *sev) +aio_unix_socketpair_test(completion comp, struct sigevent *sev, bool vectored)  {  	struct aio_context ac;  	struct rusage ru_before, ru_after; @@ -460,14 +543,16 @@ aio_unix_socketpair_test(completion comp, struct sigevent *sev)  	
aio_context_init(&ac, sockets[0], sockets[1], UNIX_SOCKETPAIR_LEN);  	ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_before) != -1,  	    "getrusage failed: %s", strerror(errno)); -	aio_write_test(&ac, comp, sev); +	if (vectored) { +		aio_writev_test(&ac, comp, sev); +		aio_readv_test(&ac, comp, sev); +	} else { +		aio_write_test(&ac, comp, sev); +		aio_read_test(&ac, comp, sev); +	}  	ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_after) != -1,  	    "getrusage failed: %s", strerror(errno));  	ATF_REQUIRE(ru_after.ru_msgsnd == ru_before.ru_msgsnd + 1); -	ru_before = ru_after; -	aio_read_test(&ac, comp, sev); -	ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_after) != -1, -	    "getrusage failed: %s", strerror(errno));  	ATF_REQUIRE(ru_after.ru_msgrcv == ru_before.ru_msgrcv + 1);  	close(sockets[0]); @@ -477,31 +562,31 @@ aio_unix_socketpair_test(completion comp, struct sigevent *sev)  ATF_TC_WITHOUT_HEAD(socket_poll);  ATF_TC_BODY(socket_poll, tc)  { -	aio_unix_socketpair_test(poll, NULL); +	aio_unix_socketpair_test(poll, NULL, false);  }  ATF_TC_WITHOUT_HEAD(socket_signal);  ATF_TC_BODY(socket_signal, tc)  { -	aio_unix_socketpair_test(poll_signaled, setup_signal()); +	aio_unix_socketpair_test(poll_signaled, setup_signal(), false);  }  ATF_TC_WITHOUT_HEAD(socket_suspend);  ATF_TC_BODY(socket_suspend, tc)  { -	aio_unix_socketpair_test(suspend, NULL); +	aio_unix_socketpair_test(suspend, NULL, false);  }  ATF_TC_WITHOUT_HEAD(socket_thread);  ATF_TC_BODY(socket_thread, tc)  { -	aio_unix_socketpair_test(poll_signaled, setup_thread()); +	aio_unix_socketpair_test(poll_signaled, setup_thread(), false);  }  ATF_TC_WITHOUT_HEAD(socket_waitcomplete);  ATF_TC_BODY(socket_waitcomplete, tc)  { -	aio_unix_socketpair_test(waitcomplete, NULL); +	aio_unix_socketpair_test(waitcomplete, NULL, false);  }  struct aio_pty_arg { @@ -629,40 +714,11 @@ ATF_TC_BODY(pipe_waitcomplete, tc)  #define	MD_LEN		GLOBAL_MAX  #define	MDUNIT_LINK	"mdunit_link" -static void -aio_md_cleanup(void) -{ -	struct 
md_ioctl mdio; -	int mdctl_fd, error, n, unit; -	char buf[80]; - -	mdctl_fd = open("/dev/" MDCTL_NAME, O_RDWR, 0); -	ATF_REQUIRE(mdctl_fd >= 0); -	n = readlink(MDUNIT_LINK, buf, sizeof(buf)); -	if (n > 0) { -		if (sscanf(buf, "%d", &unit) == 1 && unit >= 0) { -			bzero(&mdio, sizeof(mdio)); -			mdio.md_version = MDIOVERSION; -			mdio.md_unit = unit; -			if (ioctl(mdctl_fd, MDIOCDETACH, &mdio) == -1) { -				error = errno; -				close(mdctl_fd); -				errno = error; -				atf_tc_fail("ioctl MDIOCDETACH failed: %s", -				    strerror(errno)); -			} -		} -	} -		 -	close(mdctl_fd); -} - -static void -aio_md_test(completion comp, struct sigevent *sev) +static int +aio_md_setup(void)  {  	int error, fd, mdctl_fd, unit;  	char pathname[PATH_MAX]; -	struct aio_context ac;  	struct md_ioctl mdio;  	char buf[80]; @@ -695,9 +751,52 @@ aio_md_test(completion comp, struct sigevent *sev)  	ATF_REQUIRE_MSG(fd != -1,  	    "opening %s failed: %s", pathname, strerror(errno)); +	return (fd); +} + +static void +aio_md_cleanup(void) +{ +	struct md_ioctl mdio; +	int mdctl_fd, error, n, unit; +	char buf[80]; + +	mdctl_fd = open("/dev/" MDCTL_NAME, O_RDWR, 0); +	ATF_REQUIRE(mdctl_fd >= 0); +	n = readlink(MDUNIT_LINK, buf, sizeof(buf)); +	if (n > 0) { +		if (sscanf(buf, "%d", &unit) == 1 && unit >= 0) { +			bzero(&mdio, sizeof(mdio)); +			mdio.md_version = MDIOVERSION; +			mdio.md_unit = unit; +			if (ioctl(mdctl_fd, MDIOCDETACH, &mdio) == -1) { +				error = errno; +				close(mdctl_fd); +				errno = error; +				atf_tc_fail("ioctl MDIOCDETACH failed: %s", +				    strerror(errno)); +			} +		} +	} + +	close(mdctl_fd); +} + +static void +aio_md_test(completion comp, struct sigevent *sev, bool vectored) +{ +	struct aio_context ac; +	int fd; + +	fd = aio_md_setup();  	aio_context_init(&ac, fd, fd, MD_LEN); -	aio_write_test(&ac, comp, sev); -	aio_read_test(&ac, comp, sev); +	if (vectored) { +		aio_writev_test(&ac, comp, sev); +		aio_readv_test(&ac, comp, sev); +	} else { +		aio_write_test(&ac, 
comp, sev); +		aio_read_test(&ac, comp, sev); +	}  	close(fd);  } @@ -710,7 +809,7 @@ ATF_TC_HEAD(md_poll, tc)  }  ATF_TC_BODY(md_poll, tc)  { -	aio_md_test(poll, NULL); +	aio_md_test(poll, NULL, false);  }  ATF_TC_CLEANUP(md_poll, tc)  { @@ -725,7 +824,7 @@ ATF_TC_HEAD(md_signal, tc)  }  ATF_TC_BODY(md_signal, tc)  { -	aio_md_test(poll_signaled, setup_signal()); +	aio_md_test(poll_signaled, setup_signal(), false);  }  ATF_TC_CLEANUP(md_signal, tc)  { @@ -740,7 +839,7 @@ ATF_TC_HEAD(md_suspend, tc)  }  ATF_TC_BODY(md_suspend, tc)  { -	aio_md_test(suspend, NULL); +	aio_md_test(suspend, NULL, false);  }  ATF_TC_CLEANUP(md_suspend, tc)  { @@ -755,7 +854,7 @@ ATF_TC_HEAD(md_thread, tc)  }  ATF_TC_BODY(md_thread, tc)  { -	aio_md_test(poll_signaled, setup_thread()); +	aio_md_test(poll_signaled, setup_thread(), false);  }  ATF_TC_CLEANUP(md_thread, tc)  { @@ -770,13 +869,89 @@ ATF_TC_HEAD(md_waitcomplete, tc)  }  ATF_TC_BODY(md_waitcomplete, tc)  { -	aio_md_test(waitcomplete, NULL); +	aio_md_test(waitcomplete, NULL, false);  }  ATF_TC_CLEANUP(md_waitcomplete, tc)  {  	aio_md_cleanup();  } +#define	ZVOL_VDEV_PATHNAME	"test_vdev" +#define POOL_SIZE		(1 << 28)	/* 256 MB */ +#define ZVOL_SIZE		"64m" +#define POOL_NAME		"aio_testpool" +#define ZVOL_NAME		"aio_testvol" + +static int +aio_zvol_setup(void) +{ +	FILE *pidfile; +	int fd; +	pid_t pid; +	char pool_name[80]; +	char cmd[160]; +	char zvol_name[160]; +	char devname[160]; + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_KERNEL_MODULE("zfs"); + +	fd = open(ZVOL_VDEV_PATHNAME, O_RDWR | O_CREAT, 0600); +	ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); +	ATF_REQUIRE_EQ_MSG(0, +	    ftruncate(fd, POOL_SIZE), "ftruncate failed: %s", strerror(errno)); +	close(fd); + +	pid = getpid(); +	pidfile = fopen("pidfile", "w"); +	ATF_REQUIRE_MSG(NULL != pidfile, "fopen: %s", strerror(errno)); +	fprintf(pidfile, "%d", pid); +	fclose(pidfile); + +	snprintf(pool_name, sizeof(pool_name), POOL_NAME ".%d", pid); +	
snprintf(zvol_name, sizeof(zvol_name), "%s/" ZVOL_NAME, pool_name); +	snprintf(cmd, sizeof(cmd), "zpool create %s $PWD/" ZVOL_VDEV_PATHNAME, +	    pool_name); +	ATF_REQUIRE_EQ_MSG(0, system(cmd), +	    "zpool create failed: %s", strerror(errno)); +	snprintf(cmd, sizeof(cmd), +	    "zfs create -o volblocksize=8192 -o volmode=dev -V " +		ZVOL_SIZE " %s", zvol_name); +	ATF_REQUIRE_EQ_MSG(0, system(cmd), +	    "zfs create failed: %s", strerror(errno)); +	/* +	 * XXX Due to bug 251828, we need an extra "zfs set" here +	 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=251828 +	 */ +	snprintf(cmd, sizeof(cmd), "zfs set volmode=dev %s", zvol_name); +	ATF_REQUIRE_EQ_MSG(0, system(cmd), +	    "zfs set failed: %s", strerror(errno)); + +	snprintf(devname, sizeof(devname), "/dev/zvol/%s", zvol_name); +	do { +		fd = open(devname, O_RDWR); +	} while (fd == -1 && errno == EINTR) ; +	ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); +	return (fd); +} + +static void +aio_zvol_cleanup(void) +{ +	FILE *pidfile; +	pid_t testpid; +	char cmd[160]; + +	pidfile = fopen("pidfile", "r"); +	ATF_REQUIRE_MSG(NULL != pidfile, "fopen: %s", strerror(errno)); +	ATF_REQUIRE_EQ(1, fscanf(pidfile, "%d", &testpid)); +	fclose(pidfile); + +	snprintf(cmd, sizeof(cmd), "zpool destroy " POOL_NAME ".%d", testpid); +	system(cmd); +} + +  ATF_TC_WITHOUT_HEAD(aio_large_read_test);  ATF_TC_BODY(aio_large_read_test, tc)  { @@ -907,16 +1082,13 @@ ATF_TC_BODY(aio_socket_two_reads, tc)  	close(s[0]);  } -/* - * This test ensures that aio_write() on a blocking socket of a "large" - * buffer does not return a short completion. 
- */ -ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write); -ATF_TC_BODY(aio_socket_blocking_short_write, tc) +static void +aio_socket_blocking_short_write_test(bool vectored)  {  	struct aiocb iocb, *iocbp; +	struct iovec iov[2];  	char *buffer[2]; -	ssize_t done; +	ssize_t done, r;  	int buffer_size, sb_size;  	socklen_t len;  	int s[2]; @@ -954,9 +1126,21 @@ ATF_TC_BODY(aio_socket_blocking_short_write, tc)  	memset(&iocb, 0, sizeof(iocb));  	iocb.aio_fildes = s[1]; -	iocb.aio_buf = buffer[1]; -	iocb.aio_nbytes = buffer_size; -	ATF_REQUIRE(aio_write(&iocb) == 0); +	if (vectored) { +		iov[0].iov_base = buffer[1]; +		iov[0].iov_len = buffer_size / 2 + 1; +		iov[1].iov_base = buffer[1] + buffer_size / 2 + 1; +		iov[1].iov_len = buffer_size / 2 - 1; +		iocb.aio_iov = iov; +		iocb.aio_iovcnt = 2; +		r = aio_writev(&iocb); +		ATF_CHECK_EQ_MSG(0, r, "aio_writev returned %zd", r); +	} else { +		iocb.aio_buf = buffer[1]; +		iocb.aio_nbytes = buffer_size; +		r = aio_write(&iocb); +		ATF_CHECK_EQ_MSG(0, r, "aio_writev returned %zd", r); +	}  	done = recv(s[0], buffer[0], buffer_size, MSG_WAITALL);  	ATF_REQUIRE(done == buffer_size); @@ -972,6 +1156,26 @@ ATF_TC_BODY(aio_socket_blocking_short_write, tc)  }  /* + * This test ensures that aio_write() on a blocking socket of a "large" + * buffer does not return a short completion. + */ +ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write); +ATF_TC_BODY(aio_socket_blocking_short_write, tc) +{ +	aio_socket_blocking_short_write_test(false); +} + +/* + * Like aio_socket_blocking_short_write, but also tests that partially + * completed vectored sends can be retried correctly. + */ +ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write_vectored); +ATF_TC_BODY(aio_socket_blocking_short_write_vectored, tc) +{ +	aio_socket_blocking_short_write_test(true); +} + +/*   * This test verifies that cancelling a partially completed socket write   * returns a short write rather than ECANCELED.   
*/ @@ -1155,6 +1359,395 @@ ATF_TC_BODY(aio_fsync_test, tc)  	close(fd);  } +/* + * We shouldn't be able to DoS the system by setting iov_len to an insane + * value + */ +ATF_TC_WITHOUT_HEAD(aio_writev_dos_iov_len); +ATF_TC_BODY(aio_writev_dos_iov_len, tc) +{ +	struct aiocb aio; +	const struct aiocb *const iocbs[] = {&aio}; +	const char *wbuf = "Hello, world!"; +	struct iovec iov[1]; +	ssize_t len, r; +	int fd; + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_UNSAFE_AIO(); + +	fd = open("testfile", O_RDWR | O_CREAT, 0600); +	ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + +	len = strlen(wbuf); +	iov[0].iov_base = __DECONST(void*, wbuf); +	iov[0].iov_len = 1 << 30; +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = fd; +	aio.aio_offset = 0; +	aio.aio_iov = iov; +	aio.aio_iovcnt = 1; + +	r = aio_writev(&aio); +	ATF_CHECK_EQ_MSG(0, r, "aio_writev returned %zd", r); +	ATF_REQUIRE_EQ(0, aio_suspend(iocbs, 1, NULL)); +	r = aio_return(&aio); +	ATF_CHECK_EQ_MSG(-1, r, "aio_return returned %zd", r); +	ATF_CHECK_MSG(errno == EFAULT || errno == EINVAL, +	    "aio_writev: %s", strerror(errno)); + +	close(fd); +} + +/* + * We shouldn't be able to DoS the system by setting aio_iovcnt to an insane + * value + */ +ATF_TC_WITHOUT_HEAD(aio_writev_dos_iovcnt); +ATF_TC_BODY(aio_writev_dos_iovcnt, tc) +{ +	struct aiocb aio; +	const char *wbuf = "Hello, world!"; +	struct iovec iov[1]; +	ssize_t len; +	int fd; + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_UNSAFE_AIO(); + +	fd = open("testfile", O_RDWR | O_CREAT, 0600); +	ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + +	len = strlen(wbuf); +	iov[0].iov_base = __DECONST(void*, wbuf); +	iov[0].iov_len = len; +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = fd; +	aio.aio_offset = 0; +	aio.aio_iov = iov; +	aio.aio_iovcnt = 1 << 30; + +	ATF_REQUIRE_EQ(-1, aio_writev(&aio)); +	ATF_CHECK_EQ(EINVAL, errno); + +	close(fd); +} + +ATF_TC_WITH_CLEANUP(aio_writev_efault); +ATF_TC_HEAD(aio_writev_efault, tc) +{ +	
atf_tc_set_md_var(tc, "descr", +	    "Vectored AIO should gracefully handle invalid addresses"); +	atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(aio_writev_efault, tc) +{ +	struct aiocb aio; +	ssize_t buflen; +	char *buffer; +	struct iovec iov[2]; +	long seed; +	int fd; + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_UNSAFE_AIO(); + +	fd = aio_md_setup(); + +	seed = random(); +	buflen = 4096; +	buffer = malloc(buflen); +	aio_fill_buffer(buffer, buflen, seed); +	iov[0].iov_base = buffer; +	iov[0].iov_len = buflen; +	iov[1].iov_base = (void*)-1;	/* Invalid! */ +	iov[1].iov_len = buflen; +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = fd; +	aio.aio_offset = 0; +	aio.aio_iov = iov; +	aio.aio_iovcnt = nitems(iov); + +	ATF_REQUIRE_EQ(-1, aio_writev(&aio)); +	ATF_CHECK_EQ(EFAULT, errno); + +	close(fd); +} +ATF_TC_CLEANUP(aio_writev_efault, tc) +{ +	aio_md_cleanup(); +} + +ATF_TC_WITHOUT_HEAD(aio_writev_empty_file_poll); +ATF_TC_BODY(aio_writev_empty_file_poll, tc) +{ +	struct aiocb aio; +	int fd; + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_UNSAFE_AIO(); + +	fd = open("testfile", O_RDWR | O_CREAT, 0600); +	ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = fd; +	aio.aio_offset = 0; +	aio.aio_iovcnt = 0; + +	ATF_REQUIRE_EQ(0, aio_writev(&aio)); +	ATF_REQUIRE_EQ(0, suspend(&aio)); + +	close(fd); +} + +ATF_TC_WITHOUT_HEAD(aio_writev_empty_file_signal); +ATF_TC_BODY(aio_writev_empty_file_signal, tc) +{ +	struct aiocb aio; +	int fd; + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_UNSAFE_AIO(); + +	fd = open("testfile", O_RDWR | O_CREAT, 0600); +	ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = fd; +	aio.aio_offset = 0; +	aio.aio_iovcnt = 0; +	aio.aio_sigevent = *setup_signal(); + +	ATF_REQUIRE_EQ(0, aio_writev(&aio)); +	ATF_REQUIRE_EQ(0, poll_signaled(&aio)); + +	close(fd); +} + +// aio_writev and aio_readv should still work 
even if the iovcnt is greater +// than the number of buffered AIO operations permitted per process. +ATF_TC_WITH_CLEANUP(vectored_big_iovcnt); +ATF_TC_HEAD(vectored_big_iovcnt, tc) +{ +	atf_tc_set_md_var(tc, "descr", +	    "Vectored AIO should still work even if the iovcnt is greater than " +	    "the number of buffered AIO operations permitted by the process"); +	atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_big_iovcnt, tc) +{ +	struct aiocb aio; +	struct iovec *iov; +	ssize_t len, buflen; +	char *buffer; +	const char *oid = "vfs.aio.max_buf_aio"; +	long seed; +	int max_buf_aio; +	int fd, i; +	ssize_t sysctl_len = sizeof(max_buf_aio); + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_UNSAFE_AIO(); + +	if (sysctlbyname(oid, &max_buf_aio, &sysctl_len, NULL, 0) == -1) +		atf_libc_error(errno, "Failed to read %s", oid); + +	seed = random(); +	buflen = 512 * (max_buf_aio + 1); +	buffer = malloc(buflen); +	aio_fill_buffer(buffer, buflen, seed); +	iov = calloc(max_buf_aio + 1, sizeof(struct iovec)); + +	fd = aio_md_setup(); + +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = fd; +	aio.aio_offset = 0; +	for (i = 0; i < max_buf_aio + 1; i++) { +		iov[i].iov_base = &buffer[i * 512]; +		iov[i].iov_len = 512; +	} +	aio.aio_iov = iov; +	aio.aio_iovcnt = max_buf_aio + 1; + +	if (aio_writev(&aio) < 0) +		atf_tc_fail("aio_writev failed: %s", strerror(errno)); + +	len = poll(&aio); +	if (len < 0) +		atf_tc_fail("aio failed: %s", strerror(errno)); + +	if (len != buflen) +		atf_tc_fail("aio short write (%jd)", (intmax_t)len); + +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = fd; +	aio.aio_offset = 0; +	aio.aio_iov = iov; +	aio.aio_iovcnt = max_buf_aio + 1; + +	if (aio_readv(&aio) < 0) +		atf_tc_fail("aio_readv failed: %s", strerror(errno)); + +	len = poll(&aio); +	if (len < 0) +		atf_tc_fail("aio failed: %s", strerror(errno)); + +	if (len != buflen) +		atf_tc_fail("aio short read (%jd)", (intmax_t)len); + +	if (aio_test_buffer(buffer, buflen, seed) == 0) +		
atf_tc_fail("buffer mismatched"); + +	close(fd); +} +ATF_TC_CLEANUP(vectored_big_iovcnt, tc) +{ +	aio_md_cleanup(); +} + +ATF_TC_WITHOUT_HEAD(vectored_file_poll); +ATF_TC_BODY(vectored_file_poll, tc) +{ +	aio_file_test(poll, NULL, true); +} + +ATF_TC_WITH_CLEANUP(vectored_md_poll); +ATF_TC_HEAD(vectored_md_poll, tc) +{ +	atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_md_poll, tc) +{ +	aio_md_test(poll, NULL, true); +} +ATF_TC_CLEANUP(vectored_md_poll, tc) +{ +	aio_md_cleanup(); +} + +ATF_TC_WITHOUT_HEAD(vectored_socket_poll); +ATF_TC_BODY(vectored_socket_poll, tc) +{ +	aio_unix_socketpair_test(poll, NULL, true); +} + +// aio_writev and aio_readv should still work even if the iov contains elements +// that aren't a multiple of the device's sector size, and even if the total +// amount if I/O _is_ a multiple of the device's sector size. +ATF_TC_WITH_CLEANUP(vectored_unaligned); +ATF_TC_HEAD(vectored_unaligned, tc) +{ +	atf_tc_set_md_var(tc, "descr", +	    "Vectored AIO should still work even if the iov contains elements " +	    "that aren't a multiple of the sector size."); +	atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_unaligned, tc) +{ +	struct aio_context ac; +	struct aiocb aio; +	struct iovec iov[3]; +	ssize_t len, total_len; +	int fd; + +	ATF_REQUIRE_KERNEL_MODULE("aio"); +	ATF_REQUIRE_UNSAFE_AIO(); + +	/*  +	 * Use a zvol with volmode=dev, so it will allow .d_write with +	 * unaligned uio.  geom devices use physio, which doesn't allow that. 
+	 */ +	fd = aio_zvol_setup(); +	aio_context_init(&ac, fd, fd, FILE_LEN); + +	/* Break the buffer into 3 parts: +	 * * A 4kB part, aligned to 4kB +	 * * Two other parts that add up to 4kB: +	 *   - 256B +	 *   - 4kB - 256B +	 */ +	iov[0].iov_base = ac.ac_buffer; +	iov[0].iov_len = 4096; +	iov[1].iov_base = (void*)((uintptr_t)iov[0].iov_base + iov[0].iov_len); +	iov[1].iov_len = 256; +	iov[2].iov_base = (void*)((uintptr_t)iov[1].iov_base + iov[1].iov_len); +	iov[2].iov_len = 4096 - iov[1].iov_len; +	total_len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = ac.ac_write_fd; +	aio.aio_offset = 0; +	aio.aio_iov = iov; +	aio.aio_iovcnt = 3; + +	if (aio_writev(&aio) < 0) +		atf_tc_fail("aio_writev failed: %s", strerror(errno)); + +	len = poll(&aio); +	if (len < 0) +		atf_tc_fail("aio failed: %s", strerror(errno)); + +	if (len != total_len) +		atf_tc_fail("aio short write (%jd)", (intmax_t)len); + +	bzero(&aio, sizeof(aio)); +	aio.aio_fildes = ac.ac_read_fd; +	aio.aio_offset = 0; +	aio.aio_iov = iov; +	aio.aio_iovcnt = 3; + +	if (aio_readv(&aio) < 0) +		atf_tc_fail("aio_readv failed: %s", strerror(errno)); +	len = poll(&aio); + +	ATF_REQUIRE_MSG(aio_test_buffer(ac.ac_buffer, total_len, +	    ac.ac_seed) != 0, "aio_test_buffer: internal error"); + +	close(fd); +} +ATF_TC_CLEANUP(vectored_unaligned, tc) +{ +	aio_zvol_cleanup(); +} + +static void +aio_zvol_test(completion comp, struct sigevent *sev, bool vectored) +{ +	struct aio_context ac; +	int fd; + +	fd = aio_zvol_setup(); +	aio_context_init(&ac, fd, fd, MD_LEN); +	if (vectored) { +		aio_writev_test(&ac, comp, sev); +		aio_readv_test(&ac, comp, sev); +	} else { +		aio_write_test(&ac, comp, sev); +		aio_read_test(&ac, comp, sev); +	} + +	close(fd); +} + +/* + * Note that unlike md, the zvol is not a geom device, does not allow unmapped + * buffers, and does not use physio. 
+ */ +ATF_TC_WITH_CLEANUP(vectored_zvol_poll); +ATF_TC_HEAD(vectored_zvol_poll, tc) +{ +	atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_zvol_poll, tc) +{ +	aio_zvol_test(poll, NULL, true); +} +ATF_TC_CLEANUP(vectored_zvol_poll, tc) +{ +	aio_zvol_cleanup(); +} +  ATF_TP_ADD_TCS(tp)  { @@ -1193,7 +1786,19 @@ ATF_TP_ADD_TCS(tp)  	ATF_TP_ADD_TC(tp, aio_large_read_test);  	ATF_TP_ADD_TC(tp, aio_socket_two_reads);  	ATF_TP_ADD_TC(tp, aio_socket_blocking_short_write); +	ATF_TP_ADD_TC(tp, aio_socket_blocking_short_write_vectored);  	ATF_TP_ADD_TC(tp, aio_socket_short_write_cancel); +	ATF_TP_ADD_TC(tp, aio_writev_dos_iov_len); +	ATF_TP_ADD_TC(tp, aio_writev_dos_iovcnt); +	ATF_TP_ADD_TC(tp, aio_writev_efault); +	ATF_TP_ADD_TC(tp, aio_writev_empty_file_poll); +	ATF_TP_ADD_TC(tp, aio_writev_empty_file_signal); +	ATF_TP_ADD_TC(tp, vectored_big_iovcnt); +	ATF_TP_ADD_TC(tp, vectored_file_poll); +	ATF_TP_ADD_TC(tp, vectored_md_poll); +	ATF_TP_ADD_TC(tp, vectored_zvol_poll); +	ATF_TP_ADD_TC(tp, vectored_unaligned); +	ATF_TP_ADD_TC(tp, vectored_socket_poll);  	return (atf_no_error());  } | 
