diff options
53 files changed, 28580 insertions, 0 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc new file mode 100644 index 000000000000..79cb83a0a22d --- /dev/null +++ b/sys/kern/Make.tags.inc @@ -0,0 +1,19 @@ +# @(#)Make.tags.inc 8.2 (Berkeley) 11/23/94 + +# Common files for "make tags". +# Included by the Makefile for each architecture. + +# Put the ../sys stuff near the end so that subroutine definitions win when +# there is a struct tag with the same name (eg., vmmeter). The real +# solution would probably be for ctags to generate "struct vmmeter" tags. + +COMM= /sys/conf/*.[ch] \ + /sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \ + /sys/isofs/*/*.[ch] \ + /sys/kern/*.[ch] /sys/libkern/*.[ch] \ + /sys/miscfs/*/*.[ch] \ + /sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \ + /sys/netiso/*.[ch] /sys/netns/*.[ch] \ + /sys/nfs/*.[ch] /sys/sys/*.[ch] \ + /sys/ufs/*/*.[ch] \ + /sys/vm/*.[ch] diff --git a/sys/kern/Makefile b/sys/kern/Makefile new file mode 100644 index 000000000000..3159d20e9691 --- /dev/null +++ b/sys/kern/Makefile @@ -0,0 +1,50 @@ +# @(#)Makefile 8.3 (Berkeley) 2/14/95 + +# Makefile for kernel tags files, init_sysent, etc. + +ARCH= hp300 i386 luna68k news3400 pmax sparc tahoe vax + +all: + @echo "make tags, make links or init_sysent.c only" + +init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscallargs.h: makesyscalls.sh syscalls.master + -mv -f init_sysent.c init_sysent.c.bak + -mv -f syscalls.c syscalls.c.bak + -mv -f ../sys/syscall.h ../sys/syscall.h.bak + sh makesyscalls.sh syscalls.conf syscalls.master + +# Kernel tags: +# Tags files are built in the top-level directory for each architecture, +# with a makefile listing the architecture-dependent files, etc. The list +# of common files is in ./Make.tags.inc. Links to the correct tags file +# are placed in each source directory. 
We need to have links to tags files +# from the generic directories that are relative to the machine type, even +# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at +# ${SYSDIR}/${MACHINE}/tags. + +SYSTAGS=/var/db/sys_tags +SYSDIR=/sys + +# Directories in which to place tags links (other than machine-dependent) +DGEN= conf \ + dev dev/scsi \ + hp hp/dev hp/hpux \ + kern libkern \ + miscfs miscfs/deadfs miscfs/fdesc miscfs/fifofs miscfs/kernfs \ + miscfs/lofs miscfs/nullfs miscfs/portal miscfs/procfs \ + miscfs/specfs miscfs/umapfs miscfs/union \ + net netccitt netinet netiso netns nfs scripts sys \ + ufs ufs/ffs ufs/lfs ufs/mfs ufs/ufs \ + vm + +tags:: + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} tags); done + +links:: + rm -f ${SYSTAGS} + ln -s ${SYSDIR}/${MACHINE}/tags ${SYSTAGS} + -for i in ${DGEN}; do \ + (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c new file mode 100644 index 000000000000..61a0a14d5087 --- /dev/null +++ b/sys/kern/init_main.c @@ -0,0 +1,412 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)init_main.c 8.16 (Berkeley) 5/14/95 + */ + +#include <sys/param.h> +#include <sys/filedesc.h> +#include <sys/errno.h> +#include <sys/exec.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/map.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/conf.h> +#include <sys/buf.h> +#include <sys/clist.h> +#include <sys/device.h> +#include <sys/protosw.h> +#include <sys/reboot.h> +#include <sys/user.h> +#include <sys/syscallargs.h> + +#include <ufs/ufs/quota.h> + +#include <machine/cpu.h> + +#include <vm/vm.h> + +#ifdef HPFPLIB +char copyright[] = +"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California.\nCopyright (c) 1992 Hewlett-Packard Company\nCopyright (c) 1992 Motorola Inc.\nAll rights reserved.\n\n"; +#else +char copyright[] = +"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n"; +#endif + +/* Components of the first process -- never freed. */ +struct session session0; +struct pgrp pgrp0; +struct proc proc0; +struct pcred cred0; +struct filedesc0 filedesc0; +struct plimit limit0; +struct vmspace vmspace0; +struct proc *curproc = &proc0; +struct proc *initproc, *pageproc; + +int cmask = CMASK; +extern struct user *proc0paddr; + +struct vnode *rootvp, *swapdev_vp; +int boothowto; +struct timeval boottime; +struct timeval runtime; + +static void start_init __P((struct proc *p, void *framep)); + +/* + * System startup; initialize the world, create process 0, mount root + * filesystem, and fork to create init and pagedaemon. Most of the + * hard work is done in the lower-level initialization routines including + * startup(), which does memory initialization and autoconfiguration. 
+ */ +main(framep) + void *framep; +{ + register struct proc *p; + register struct filedesc0 *fdp; + register struct pdevinit *pdev; + register int i; + int s; + register_t rval[2]; + extern struct pdevinit pdevinit[]; + extern void roundrobin __P((void *)); + extern void schedcpu __P((void *)); + + /* + * Initialize the current process pointer (curproc) before + * any possible traps/probes to simplify trap processing. + */ + p = &proc0; + curproc = p; + /* + * Attempt to find console and initialize + * in case of early panic or other messages. + */ + consinit(); + printf(copyright); + + vm_mem_init(); + kmeminit(); + cpu_startup(); + + /* + * Initialize process and pgrp structures. + */ + procinit(); + + /* + * Create process 0 (the swapper). + */ + LIST_INSERT_HEAD(&allproc, p, p_list); + p->p_pgrp = &pgrp0; + LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); + LIST_INIT(&pgrp0.pg_members); + LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); + + pgrp0.pg_session = &session0; + session0.s_count = 1; + session0.s_leader = p; + + p->p_flag = P_INMEM | P_SYSTEM; + p->p_stat = SRUN; + p->p_nice = NZERO; + bcopy("swapper", p->p_comm, sizeof ("swapper")); + + /* Create credentials. */ + cred0.p_refcnt = 1; + p->p_cred = &cred0; + p->p_ucred = crget(); + p->p_ucred->cr_ngroups = 1; /* group 0 */ + + /* Create the file descriptor table. */ + fdp = &filedesc0; + p->p_fd = &fdp->fd_fd; + fdp->fd_fd.fd_refcnt = 1; + fdp->fd_fd.fd_cmask = cmask; + fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; + fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; + fdp->fd_fd.fd_nfiles = NDFILE; + + /* Create the limits structures. 
*/ + p->p_limit = &limit0; + for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) + limit0.pl_rlimit[i].rlim_cur = + limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC; + i = ptoa(cnt.v_free_count); + limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; + limit0.p_refcnt = 1; + + /* Allocate a prototype map so we have something to fork. */ + p->p_vmspace = &vmspace0; + vmspace0.vm_refcnt = 1; + pmap_pinit(&vmspace0.vm_pmap); + vm_map_init(&p->p_vmspace->vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAX_ADDRESS), TRUE); + vmspace0.vm_map.pmap = &vmspace0.vm_pmap; + p->p_addr = proc0paddr; /* XXX */ + + /* + * We continue to place resource usage info and signal + * actions in the user struct so they're pageable. + */ + p->p_stats = &p->p_addr->u_stats; + p->p_sigacts = &p->p_addr->u_sigacts; + + /* + * Charge root for one process. + */ + (void)chgproccnt(0, 1); + + rqinit(); + + /* Configure virtual memory system, set vm rlimits. */ + vm_init_limits(p); + + /* Initialize the file systems. */ + vfsinit(); + + /* Start real time and statistics clocks. */ + initclocks(); + + /* Initialize mbuf's. */ + mbinit(); + + /* Initialize clists. */ + clist_init(); + +#ifdef SYSVSHM + /* Initialize System V style shared memory. */ + shminit(); +#endif + + /* Attach pseudo-devices. */ + for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++) + (*pdev->pdev_attach)(pdev->pdev_count); + + /* + * Initialize protocols. Block reception of incoming packets + * until everything is ready. + */ + s = splimp(); + ifinit(); + domaininit(); + splx(s); + +#ifdef GPROF + /* Initialize kernel profiling. */ + kmstartup(); +#endif + + /* Kick off timeout driven events by calling first time. */ + roundrobin(NULL); + schedcpu(NULL); + + /* Mount the root file system. 
*/ + if (vfs_mountroot()) + panic("cannot mount root"); + mountlist.cqh_first->mnt_flag |= MNT_ROOTFS; + + /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ + if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) + panic("cannot find root vnode"); + fdp->fd_fd.fd_cdir = rootvnode; + VREF(fdp->fd_fd.fd_cdir); + VOP_UNLOCK(rootvnode, 0, p); + fdp->fd_fd.fd_rdir = NULL; + swapinit(); + + /* + * Now can look at time, having had a chance to verify the time + * from the file system. Reset p->p_rtime as it may have been + * munched in mi_switch() after the time got set. + */ + p->p_stats->p_start = runtime = mono_time = boottime = time; + p->p_rtime.tv_sec = p->p_rtime.tv_usec = 0; + + /* Initialize signal state for process 0. */ + siginit(p); + + /* Create process 1 (init(8)). */ + if (fork(p, NULL, rval)) + panic("fork init"); + if (rval[1]) { + start_init(curproc, framep); + return; + } + + /* Create process 2 (the pageout daemon). */ + if (fork(p, NULL, rval)) + panic("fork pager"); + if (rval[1]) { + /* + * Now in process 2. + */ + p = curproc; + pageproc = p; + p->p_flag |= P_INMEM | P_SYSTEM; /* XXX */ + bcopy("pagedaemon", curproc->p_comm, sizeof ("pagedaemon")); + vm_pageout(); + /* NOTREACHED */ + } + + /* The scheduler is an infinite loop. */ + scheduler(); + /* NOTREACHED */ +} + +/* + * List of paths to try when searching for "init". + */ +static char *initpaths[] = { + "/sbin/init", + "/sbin/oinit", + "/sbin/init.bak", + NULL, +}; + +/* + * Start the initial user process; try exec'ing each pathname in "initpaths". + * The program is invoked with one argument containing the boot flags. 
+ */ +static void +start_init(p, framep) + struct proc *p; + void *framep; +{ + vm_offset_t addr; + struct execve_args /* { + syscallarg(char *) path; + syscallarg(char **) argp; + syscallarg(char **) envp; + } */ args; + int options, i, error; + register_t retval[2]; + char flags[4] = "-", *flagsp; + char **pathp, *path, *ucp, **uap, *arg0, *arg1; + + initproc = p; + + /* + * We need to set the system call frame as if we were entered through + * a syscall() so that when we call execve() below, it will be able + * to set the entry point (see setregs) when it tries to exec. The + * startup code in "locore.s" has allocated space for the frame and + * passed a pointer to that space as main's argument. + */ + cpu_set_init_frame(p, framep); + + /* + * Need just enough stack to hold the faked-up "execve()" arguments. + */ + addr = trunc_page(VM_MAX_ADDRESS - PAGE_SIZE); + if (vm_allocate(&p->p_vmspace->vm_map, &addr, PAGE_SIZE, FALSE) != 0) + panic("init: couldn't allocate argument space"); + p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + + for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { + /* + * Construct the boot flag argument. + */ + options = 0; + flagsp = flags + 1; + ucp = (char *)USRSTACK; + if (boothowto & RB_SINGLE) { + *flagsp++ = 's'; + options = 1; + } +#ifdef notyet + if (boothowto & RB_FASTBOOT) { + *flagsp++ = 'f'; + options = 1; + } +#endif + /* + * Move out the flags (arg 1), if necessary. + */ + if (options != 0) { + *flagsp++ = '\0'; + i = flagsp - flags; + (void)copyout((caddr_t)flags, (caddr_t)(ucp -= i), i); + arg1 = ucp; + } + + /* + * Move out the file name (also arg 0). + */ + i = strlen(path) + 1; + (void)copyout((caddr_t)path, (caddr_t)(ucp -= i), i); + arg0 = ucp; + + /* + * Move out the arg pointers. 
+ */ + uap = (char **)((long)ucp & ~ALIGNBYTES); + (void)suword((caddr_t)--uap, 0); /* terminator */ + if (options != 0) + (void)suword((caddr_t)--uap, (long)arg1); + (void)suword((caddr_t)--uap, (long)arg0); + + /* + * Point at the arguments. + */ + SCARG(&args, path) = arg0; + SCARG(&args, argp) = uap; + SCARG(&args, envp) = NULL; + + /* + * Now try to exec the program. If can't for any reason + * other than it doesn't exist, complain. + */ + if ((error = execve(p, &args, retval)) == 0) + return; + if (error != ENOENT) + printf("exec %s: error %d\n", path, error); + } + printf("init: not found\n"); + panic("no init"); +} diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c new file mode 100644 index 000000000000..0bbdd2025b6b --- /dev/null +++ b/sys/kern/init_sysent.c @@ -0,0 +1,767 @@ +/* + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. + * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/signal.h> +#include <sys/mount.h> +#include <sys/syscallargs.h> +int nosys(); +int exit(); +int fork(); +int read(); +int write(); +int open(); +int close(); +int wait4(); +int link(); +int unlink(); +int chdir(); +int fchdir(); +int mknod(); +int chmod(); +int chown(); +int obreak(); +int getfsstat(); +int getpid(); +int mount(); +int unmount(); +int setuid(); +int getuid(); +int geteuid(); +int ptrace(); +int recvmsg(); +int sendmsg(); +int recvfrom(); +int accept(); +int getpeername(); +int getsockname(); +int access(); +int chflags(); +int fchflags(); +int sync(); +int kill(); +int getppid(); +int dup(); +int pipe(); +int getegid(); +int profil(); +#ifdef KTRACE +int ktrace(); +#else +#endif +int sigaction(); +int getgid(); +int sigprocmask(); +int getlogin(); +int setlogin(); +int acct(); +int sigpending(); +int sigaltstack(); +int ioctl(); +int reboot(); +int revoke(); +int symlink(); +int readlink(); +int execve(); +int umask(); +int chroot(); 
+int msync(); +int vfork(); +int sbrk(); +int sstk(); +int ovadvise(); +int munmap(); +int mprotect(); +int madvise(); +int mincore(); +int getgroups(); +int setgroups(); +int getpgrp(); +int setpgid(); +int setitimer(); +int swapon(); +int getitimer(); +int getdtablesize(); +int dup2(); +int fcntl(); +int select(); +int fsync(); +int setpriority(); +int socket(); +int connect(); +int getpriority(); +int sigreturn(); +int bind(); +int setsockopt(); +int listen(); +int sigsuspend(); +#ifdef TRACE +int vtrace(); +#else +#endif +int gettimeofday(); +int getrusage(); +int getsockopt(); +#ifdef vax +int resuba(); +#else +#endif +int readv(); +int writev(); +int settimeofday(); +int fchown(); +int fchmod(); +int rename(); +int flock(); +int mkfifo(); +int sendto(); +int shutdown(); +int socketpair(); +int mkdir(); +int rmdir(); +int utimes(); +int adjtime(); +int setsid(); +int quotactl(); +#ifdef NFS +int nfssvc(); +#else +#endif +int statfs(); +int fstatfs(); +#ifdef NFS +int getfh(); +#else +#endif +#if defined(SYSVSHM) && !defined(alpha) +#else +#endif +int setgid(); +int setegid(); +int seteuid(); +#ifdef LFS +int lfs_bmapv(); +int lfs_markv(); +int lfs_segclean(); +int lfs_segwait(); +#else +#endif +int stat(); +int fstat(); +int lstat(); +int pathconf(); +int fpathconf(); +int getrlimit(); +int setrlimit(); +int getdirentries(); +int mmap(); +int nosys(); +int lseek(); +int truncate(); +int ftruncate(); +int __sysctl(); +int mlock(); +int munlock(); +int undelete(); +#if defined(SYSVSHM) && 0 +int shmat(); +int shmctl(); +int shmdt(); +int shmget(); +#else +#endif + +#ifdef COMPAT_43 +#define compat_43(func) __CONCAT(compat_43_,func) + +int compat_43(creat)(); +int compat_43(lseek)(); +int compat_43(stat)(); +int compat_43(lstat)(); +#ifdef KTRACE +#else +#endif +int compat_43(fstat)(); +int compat_43(getkerninfo)(); +int compat_43(getpagesize)(); +int compat_43(mmap)(); +int compat_43(wait)(); +int compat_43(gethostname)(); +int compat_43(sethostname)(); +int 
compat_43(accept)(); +int compat_43(send)(); +int compat_43(recv)(); +int compat_43(sigvec)(); +int compat_43(sigblock)(); +int compat_43(sigsetmask)(); +int compat_43(sigstack)(); +int compat_43(recvmsg)(); +int compat_43(sendmsg)(); +#ifdef TRACE +#else +#endif +#ifdef vax +#else +#endif +int compat_43(recvfrom)(); +int compat_43(setreuid)(); +int compat_43(setregid)(); +int compat_43(truncate)(); +int compat_43(ftruncate)(); +int compat_43(getpeername)(); +int compat_43(gethostid)(); +int compat_43(sethostid)(); +int compat_43(getrlimit)(); +int compat_43(setrlimit)(); +int compat_43(killpg)(); +int compat_43(quota)(); +int compat_43(getsockname)(); +#ifdef NFS +#else +#endif +int compat_43(getdirentries)(); +#ifdef NFS +#else +#endif +#if defined(SYSVSHM) && !defined(alpha) +int compat_43(shmsys)(); +#else +#endif +#ifdef LFS +#else +#endif +#if defined(SYSVSHM) && 0 +#else +#endif + +#else /* COMPAT_43 */ +#define compat_43(func) nosys +#endif /* COMPAT_43 */ + +#define s(type) sizeof(type) + +struct sysent sysent[] = { + { 0, 0, + nosys }, /* 0 = syscall */ + { 1, s(struct exit_args), + exit }, /* 1 = exit */ + { 0, 0, + fork }, /* 2 = fork */ + { 3, s(struct read_args), + read }, /* 3 = read */ + { 3, s(struct write_args), + write }, /* 4 = write */ + { 3, s(struct open_args), + open }, /* 5 = open */ + { 1, s(struct close_args), + close }, /* 6 = close */ + { 4, s(struct wait4_args), + wait4 }, /* 7 = wait4 */ + { 2, s(struct compat_43_creat_args), + compat_43(creat) }, /* 8 = compat_43 creat */ + { 2, s(struct link_args), + link }, /* 9 = link */ + { 1, s(struct unlink_args), + unlink }, /* 10 = unlink */ + { 0, 0, + nosys }, /* 11 = obsolete execv */ + { 1, s(struct chdir_args), + chdir }, /* 12 = chdir */ + { 1, s(struct fchdir_args), + fchdir }, /* 13 = fchdir */ + { 3, s(struct mknod_args), + mknod }, /* 14 = mknod */ + { 2, s(struct chmod_args), + chmod }, /* 15 = chmod */ + { 3, s(struct chown_args), + chown }, /* 16 = chown */ + { 1, s(struct 
obreak_args), + obreak }, /* 17 = break */ + { 3, s(struct getfsstat_args), + getfsstat }, /* 18 = getfsstat */ + { 3, s(struct compat_43_lseek_args), + compat_43(lseek) }, /* 19 = compat_43 lseek */ + { 0, 0, + getpid }, /* 20 = getpid */ + { 4, s(struct mount_args), + mount }, /* 21 = mount */ + { 2, s(struct unmount_args), + unmount }, /* 22 = unmount */ + { 1, s(struct setuid_args), + setuid }, /* 23 = setuid */ + { 0, 0, + getuid }, /* 24 = getuid */ + { 0, 0, + geteuid }, /* 25 = geteuid */ + { 4, s(struct ptrace_args), + ptrace }, /* 26 = ptrace */ + { 3, s(struct recvmsg_args), + recvmsg }, /* 27 = recvmsg */ + { 3, s(struct sendmsg_args), + sendmsg }, /* 28 = sendmsg */ + { 6, s(struct recvfrom_args), + recvfrom }, /* 29 = recvfrom */ + { 3, s(struct accept_args), + accept }, /* 30 = accept */ + { 3, s(struct getpeername_args), + getpeername }, /* 31 = getpeername */ + { 3, s(struct getsockname_args), + getsockname }, /* 32 = getsockname */ + { 2, s(struct access_args), + access }, /* 33 = access */ + { 2, s(struct chflags_args), + chflags }, /* 34 = chflags */ + { 2, s(struct fchflags_args), + fchflags }, /* 35 = fchflags */ + { 0, 0, + sync }, /* 36 = sync */ + { 2, s(struct kill_args), + kill }, /* 37 = kill */ + { 2, s(struct compat_43_stat_args), + compat_43(stat) }, /* 38 = compat_43 stat */ + { 0, 0, + getppid }, /* 39 = getppid */ + { 2, s(struct compat_43_lstat_args), + compat_43(lstat) }, /* 40 = compat_43 lstat */ + { 1, s(struct dup_args), + dup }, /* 41 = dup */ + { 0, 0, + pipe }, /* 42 = pipe */ + { 0, 0, + getegid }, /* 43 = getegid */ + { 4, s(struct profil_args), + profil }, /* 44 = profil */ +#ifdef KTRACE + { 4, s(struct ktrace_args), + ktrace }, /* 45 = ktrace */ +#else + { 0, 0, + nosys }, /* 45 = unimplemented ktrace */ +#endif + { 3, s(struct sigaction_args), + sigaction }, /* 46 = sigaction */ + { 0, 0, + getgid }, /* 47 = getgid */ + { 2, s(struct sigprocmask_args), + sigprocmask }, /* 48 = sigprocmask */ + { 2, s(struct 
getlogin_args), + getlogin }, /* 49 = getlogin */ + { 1, s(struct setlogin_args), + setlogin }, /* 50 = setlogin */ + { 1, s(struct acct_args), + acct }, /* 51 = acct */ + { 0, 0, + sigpending }, /* 52 = sigpending */ + { 2, s(struct sigaltstack_args), + sigaltstack }, /* 53 = sigaltstack */ + { 3, s(struct ioctl_args), + ioctl }, /* 54 = ioctl */ + { 1, s(struct reboot_args), + reboot }, /* 55 = reboot */ + { 1, s(struct revoke_args), + revoke }, /* 56 = revoke */ + { 2, s(struct symlink_args), + symlink }, /* 57 = symlink */ + { 3, s(struct readlink_args), + readlink }, /* 58 = readlink */ + { 3, s(struct execve_args), + execve }, /* 59 = execve */ + { 1, s(struct umask_args), + umask }, /* 60 = umask */ + { 1, s(struct chroot_args), + chroot }, /* 61 = chroot */ + { 2, s(struct compat_43_fstat_args), + compat_43(fstat) }, /* 62 = compat_43 fstat */ + { 4, s(struct compat_43_getkerninfo_args), + compat_43(getkerninfo) }, /* 63 = compat_43 getkerninfo */ + { 0, 0, + compat_43(getpagesize) }, /* 64 = compat_43 getpagesize */ + { 2, s(struct msync_args), + msync }, /* 65 = msync */ + { 0, 0, + vfork }, /* 66 = vfork */ + { 0, 0, + nosys }, /* 67 = obsolete vread */ + { 0, 0, + nosys }, /* 68 = obsolete vwrite */ + { 1, s(struct sbrk_args), + sbrk }, /* 69 = sbrk */ + { 1, s(struct sstk_args), + sstk }, /* 70 = sstk */ + { 6, s(struct compat_43_mmap_args), + compat_43(mmap) }, /* 71 = compat_43 mmap */ + { 1, s(struct ovadvise_args), + ovadvise }, /* 72 = vadvise */ + { 2, s(struct munmap_args), + munmap }, /* 73 = munmap */ + { 3, s(struct mprotect_args), + mprotect }, /* 74 = mprotect */ + { 3, s(struct madvise_args), + madvise }, /* 75 = madvise */ + { 0, 0, + nosys }, /* 76 = obsolete vhangup */ + { 0, 0, + nosys }, /* 77 = obsolete vlimit */ + { 3, s(struct mincore_args), + mincore }, /* 78 = mincore */ + { 2, s(struct getgroups_args), + getgroups }, /* 79 = getgroups */ + { 2, s(struct setgroups_args), + setgroups }, /* 80 = setgroups */ + { 0, 0, + getpgrp }, 
/* 81 = getpgrp */ + { 2, s(struct setpgid_args), + setpgid }, /* 82 = setpgid */ + { 3, s(struct setitimer_args), + setitimer }, /* 83 = setitimer */ + { 0, 0, + compat_43(wait) }, /* 84 = compat_43 wait */ + { 1, s(struct swapon_args), + swapon }, /* 85 = swapon */ + { 2, s(struct getitimer_args), + getitimer }, /* 86 = getitimer */ + { 2, s(struct compat_43_gethostname_args), + compat_43(gethostname) }, /* 87 = compat_43 gethostname */ + { 2, s(struct compat_43_sethostname_args), + compat_43(sethostname) }, /* 88 = compat_43 sethostname */ + { 0, 0, + getdtablesize }, /* 89 = getdtablesize */ + { 2, s(struct dup2_args), + dup2 }, /* 90 = dup2 */ + { 0, 0, + nosys }, /* 91 = unimplemented getdopt */ + { 3, s(struct fcntl_args), + fcntl }, /* 92 = fcntl */ + { 5, s(struct select_args), + select }, /* 93 = select */ + { 0, 0, + nosys }, /* 94 = unimplemented setdopt */ + { 1, s(struct fsync_args), + fsync }, /* 95 = fsync */ + { 3, s(struct setpriority_args), + setpriority }, /* 96 = setpriority */ + { 3, s(struct socket_args), + socket }, /* 97 = socket */ + { 3, s(struct connect_args), + connect }, /* 98 = connect */ + { 3, s(struct compat_43_accept_args), + compat_43(accept) }, /* 99 = compat_43 accept */ + { 2, s(struct getpriority_args), + getpriority }, /* 100 = getpriority */ + { 4, s(struct compat_43_send_args), + compat_43(send) }, /* 101 = compat_43 send */ + { 4, s(struct compat_43_recv_args), + compat_43(recv) }, /* 102 = compat_43 recv */ + { 1, s(struct sigreturn_args), + sigreturn }, /* 103 = sigreturn */ + { 3, s(struct bind_args), + bind }, /* 104 = bind */ + { 5, s(struct setsockopt_args), + setsockopt }, /* 105 = setsockopt */ + { 2, s(struct listen_args), + listen }, /* 106 = listen */ + { 0, 0, + nosys }, /* 107 = obsolete vtimes */ + { 3, s(struct compat_43_sigvec_args), + compat_43(sigvec) }, /* 108 = compat_43 sigvec */ + { 1, s(struct compat_43_sigblock_args), + compat_43(sigblock) }, /* 109 = compat_43 sigblock */ + { 1, s(struct 
compat_43_sigsetmask_args), + compat_43(sigsetmask) }, /* 110 = compat_43 sigsetmask */ + { 1, s(struct sigsuspend_args), + sigsuspend }, /* 111 = sigsuspend */ + { 2, s(struct compat_43_sigstack_args), + compat_43(sigstack) }, /* 112 = compat_43 sigstack */ + { 3, s(struct compat_43_recvmsg_args), + compat_43(recvmsg) }, /* 113 = compat_43 recvmsg */ + { 3, s(struct compat_43_sendmsg_args), + compat_43(sendmsg) }, /* 114 = compat_43 sendmsg */ +#ifdef TRACE + { 2, s(struct vtrace_args), + vtrace }, /* 115 = vtrace */ +#else + { 0, 0, + nosys }, /* 115 = obsolete vtrace */ +#endif + { 2, s(struct gettimeofday_args), + gettimeofday }, /* 116 = gettimeofday */ + { 2, s(struct getrusage_args), + getrusage }, /* 117 = getrusage */ + { 5, s(struct getsockopt_args), + getsockopt }, /* 118 = getsockopt */ +#ifdef vax + { 1, s(struct resuba_args), + resuba }, /* 119 = resuba */ +#else + { 0, 0, + nosys }, /* 119 = unimplemented resuba */ +#endif + { 3, s(struct readv_args), + readv }, /* 120 = readv */ + { 3, s(struct writev_args), + writev }, /* 121 = writev */ + { 2, s(struct settimeofday_args), + settimeofday }, /* 122 = settimeofday */ + { 3, s(struct fchown_args), + fchown }, /* 123 = fchown */ + { 2, s(struct fchmod_args), + fchmod }, /* 124 = fchmod */ + { 6, s(struct compat_43_recvfrom_args), + compat_43(recvfrom) }, /* 125 = compat_43 recvfrom */ + { 2, s(struct compat_43_setreuid_args), + compat_43(setreuid) }, /* 126 = compat_43 setreuid */ + { 2, s(struct compat_43_setregid_args), + compat_43(setregid) }, /* 127 = compat_43 setregid */ + { 2, s(struct rename_args), + rename }, /* 128 = rename */ + { 2, s(struct compat_43_truncate_args), + compat_43(truncate) }, /* 129 = compat_43 truncate */ + { 2, s(struct compat_43_ftruncate_args), + compat_43(ftruncate) }, /* 130 = compat_43 ftruncate */ + { 2, s(struct flock_args), + flock }, /* 131 = flock */ + { 2, s(struct mkfifo_args), + mkfifo }, /* 132 = mkfifo */ + { 6, s(struct sendto_args), + sendto }, /* 133 = 
sendto */ + { 2, s(struct shutdown_args), + shutdown }, /* 134 = shutdown */ + { 4, s(struct socketpair_args), + socketpair }, /* 135 = socketpair */ + { 2, s(struct mkdir_args), + mkdir }, /* 136 = mkdir */ + { 1, s(struct rmdir_args), + rmdir }, /* 137 = rmdir */ + { 2, s(struct utimes_args), + utimes }, /* 138 = utimes */ + { 0, 0, + nosys }, /* 139 = obsolete 4.2 sigreturn */ + { 2, s(struct adjtime_args), + adjtime }, /* 140 = adjtime */ + { 3, s(struct compat_43_getpeername_args), + compat_43(getpeername) }, /* 141 = compat_43 getpeername */ + { 0, 0, + compat_43(gethostid) }, /* 142 = compat_43 gethostid */ + { 1, s(struct compat_43_sethostid_args), + compat_43(sethostid) }, /* 143 = compat_43 sethostid */ + { 2, s(struct compat_43_getrlimit_args), + compat_43(getrlimit) }, /* 144 = compat_43 getrlimit */ + { 2, s(struct compat_43_setrlimit_args), + compat_43(setrlimit) }, /* 145 = compat_43 setrlimit */ + { 2, s(struct compat_43_killpg_args), + compat_43(killpg) }, /* 146 = compat_43 killpg */ + { 0, 0, + setsid }, /* 147 = setsid */ + { 4, s(struct quotactl_args), + quotactl }, /* 148 = quotactl */ + { 0, 0, + compat_43(quota) }, /* 149 = compat_43 quota */ + { 3, s(struct compat_43_getsockname_args), + compat_43(getsockname) }, /* 150 = compat_43 getsockname */ + { 0, 0, + nosys }, /* 151 = unimplemented */ + { 0, 0, + nosys }, /* 152 = unimplemented */ + { 0, 0, + nosys }, /* 153 = unimplemented */ + { 0, 0, + nosys }, /* 154 = unimplemented */ +#ifdef NFS + { 2, s(struct nfssvc_args), + nfssvc }, /* 155 = nfssvc */ +#else + { 0, 0, + nosys }, /* 155 = unimplemented nfssvc */ +#endif + { 4, s(struct compat_43_getdirentries_args), + compat_43(getdirentries) }, /* 156 = compat_43 getdirentries */ + { 2, s(struct statfs_args), + statfs }, /* 157 = statfs */ + { 2, s(struct fstatfs_args), + fstatfs }, /* 158 = fstatfs */ + { 0, 0, + nosys }, /* 159 = unimplemented */ + { 0, 0, + nosys }, /* 160 = unimplemented */ +#ifdef NFS + { 2, s(struct getfh_args), + 
getfh }, /* 161 = getfh */ +#else + { 0, 0, + nosys }, /* 161 = unimplemented getfh */ +#endif + { 0, 0, + nosys }, /* 162 = unimplemented getdomainname */ + { 0, 0, + nosys }, /* 163 = unimplemented setdomainname */ + { 0, 0, + nosys }, /* 164 = unimplemented */ + { 0, 0, + nosys }, /* 165 = unimplemented */ + { 0, 0, + nosys }, /* 166 = unimplemented */ + { 0, 0, + nosys }, /* 167 = unimplemented */ + { 0, 0, + nosys }, /* 168 = unimplemented */ + { 0, 0, + nosys }, /* 169 = unimplemented semsys */ + { 0, 0, + nosys }, /* 170 = unimplemented msgsys */ +#if defined(SYSVSHM) && !defined(alpha) + { 4, s(struct compat_43_shmsys_args), + compat_43(shmsys) }, /* 171 = compat_43 shmsys */ +#else + { 0, 0, + nosys }, /* 171 = unimplemented shmsys */ +#endif + { 0, 0, + nosys }, /* 172 = unimplemented */ + { 0, 0, + nosys }, /* 173 = unimplemented */ + { 0, 0, + nosys }, /* 174 = unimplemented */ + { 0, 0, + nosys }, /* 175 = unimplemented */ + { 0, 0, + nosys }, /* 176 = unimplemented */ + { 0, 0, + nosys }, /* 177 = unimplemented */ + { 0, 0, + nosys }, /* 178 = unimplemented */ + { 0, 0, + nosys }, /* 179 = unimplemented */ + { 0, 0, + nosys }, /* 180 = unimplemented */ + { 1, s(struct setgid_args), + setgid }, /* 181 = setgid */ + { 1, s(struct setegid_args), + setegid }, /* 182 = setegid */ + { 1, s(struct seteuid_args), + seteuid }, /* 183 = seteuid */ +#ifdef LFS + { 3, s(struct lfs_bmapv_args), + lfs_bmapv }, /* 184 = lfs_bmapv */ + { 3, s(struct lfs_markv_args), + lfs_markv }, /* 185 = lfs_markv */ + { 2, s(struct lfs_segclean_args), + lfs_segclean }, /* 186 = lfs_segclean */ + { 2, s(struct lfs_segwait_args), + lfs_segwait }, /* 187 = lfs_segwait */ +#else + { 0, 0, + nosys }, /* 184 = unimplemented lfs_bmapv */ + { 0, 0, + nosys }, /* 185 = unimplemented lfs_markv */ + { 0, 0, + nosys }, /* 186 = unimplemented lfs_segclean */ + { 0, 0, + nosys }, /* 187 = unimplemented lfs_segwait */ +#endif + { 2, s(struct stat_args), + stat }, /* 188 = stat */ + { 2, s(struct 
fstat_args), + fstat }, /* 189 = fstat */ + { 2, s(struct lstat_args), + lstat }, /* 190 = lstat */ + { 2, s(struct pathconf_args), + pathconf }, /* 191 = pathconf */ + { 2, s(struct fpathconf_args), + fpathconf }, /* 192 = fpathconf */ + { 0, 0, + nosys }, /* 193 = unimplemented */ + { 2, s(struct getrlimit_args), + getrlimit }, /* 194 = getrlimit */ + { 2, s(struct setrlimit_args), + setrlimit }, /* 195 = setrlimit */ + { 4, s(struct getdirentries_args), + getdirentries }, /* 196 = getdirentries */ + { 7, s(struct mmap_args), + mmap }, /* 197 = mmap */ + { 0, 0, + nosys }, /* 198 = __syscall */ + { 4, s(struct lseek_args), + lseek }, /* 199 = lseek */ + { 3, s(struct truncate_args), + truncate }, /* 200 = truncate */ + { 3, s(struct ftruncate_args), + ftruncate }, /* 201 = ftruncate */ + { 6, s(struct __sysctl_args), + __sysctl }, /* 202 = __sysctl */ + { 2, s(struct mlock_args), + mlock }, /* 203 = mlock */ + { 2, s(struct munlock_args), + munlock }, /* 204 = munlock */ + { 1, s(struct undelete_args), + undelete }, /* 205 = undelete */ + { 0, 0, + nosys }, /* 206 = unimplemented */ + { 0, 0, + nosys }, /* 207 = unimplemented */ + { 0, 0, + nosys }, /* 208 = unimplemented */ + { 0, 0, + nosys }, /* 209 = unimplemented */ + { 0, 0, + nosys }, /* 210 = unimplemented */ + { 0, 0, + nosys }, /* 211 = unimplemented */ + { 0, 0, + nosys }, /* 212 = unimplemented */ + { 0, 0, + nosys }, /* 213 = unimplemented */ + { 0, 0, + nosys }, /* 214 = unimplemented */ + { 0, 0, + nosys }, /* 215 = unimplemented */ + { 0, 0, + nosys }, /* 216 = unimplemented */ + { 0, 0, + nosys }, /* 217 = unimplemented */ + { 0, 0, + nosys }, /* 218 = unimplemented */ + { 0, 0, + nosys }, /* 219 = unimplemented */ + { 0, 0, + nosys }, /* 220 = unimplemented semctl */ + { 0, 0, + nosys }, /* 221 = unimplemented semget */ + { 0, 0, + nosys }, /* 222 = unimplemented semop */ + { 0, 0, + nosys }, /* 223 = unimplemented semconfig */ + { 0, 0, + nosys }, /* 224 = unimplemented msgctl */ + { 0, 0, + 
nosys }, /* 225 = unimplemented msgget */ + { 0, 0, + nosys }, /* 226 = unimplemented msgsnd */ + { 0, 0, + nosys }, /* 227 = unimplemented msgrcv */ +#if defined(SYSVSHM) && 0 + { 3, s(struct shmat_args), + shmat }, /* 228 = shmat */ + { 3, s(struct shmctl_args), + shmctl }, /* 229 = shmctl */ + { 1, s(struct shmdt_args), + shmdt }, /* 230 = shmdt */ + { 3, s(struct shmget_args), + shmget }, /* 231 = shmget */ +#else + { 0, 0, + nosys }, /* 228 = unimplemented shmat */ + { 0, 0, + nosys }, /* 229 = unimplemented shmctl */ + { 0, 0, + nosys }, /* 230 = unimplemented shmdt */ + { 0, 0, + nosys }, /* 231 = unimplemented shmget */ +#endif +}; + +int nsysent= sizeof(sysent) / sizeof(sysent[0]); diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c new file mode 100644 index 000000000000..a23543ce0e38 --- /dev/null +++ b/sys/kern/kern_acct.c @@ -0,0 +1,127 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)kern_acct.c	8.8 (Berkeley) 5/14/95
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/syslog.h>
#include <sys/kernel.h>

/*
 * acct(2) system call: enable or disable process accounting.
 *
 * The implementation was deleted from this (4.4BSD-Lite) release for
 * licensing reasons; this stub always fails with ENOSYS.
 *
 * a1 = calling process, a2 = syscall arguments (accounting file path),
 * a3 = return-value pointer (unused by the stub).
 */
acct(a1, a2, a3)
	struct proc *a1;
	struct acct_args /* {
		syscallarg(char *) path;
	} */ *a2;
	int *a3;
{

	/*
	 * Body deleted.
	 */
	return (ENOSYS);
}

/*
 * Write an accounting record for the exiting process a1; invoked from
 * exit1() in kern_exit.c.  Body deleted in this release: a no-op stub.
 */
acct_process(a1)
	struct proc *a1;
{

	/*
	 * Body deleted.
	 */
	return;
}

/*
 * Periodically check the file system to see if accounting
 * should be turned on or off.  Beware the case where the vnode
 * has been vgone()'d out from underneath us, e.g. when the file
 * system containing the accounting file has been forcibly unmounted.
 */

/*
 * Values associated with enabling and disabling accounting
 */
int	acctsuspend = 2;	/* stop accounting when < 2% free space left */
int	acctresume = 4;		/* resume when free space risen to > 4% */
int	acctchkfreq = 15;	/* frequency (in seconds) to check space */

/*
 * SHOULD REPLACE THIS WITH A DRIVER THAT CAN BE READ TO SIMPLIFY.
 */
struct	vnode *acctp;		/* vnode of the active accounting file, or NULL */
struct	vnode *savacctp;	/* saved vnode while accounting is suspended */

/*
 * Self-rearming timeout handler: every acctchkfreq seconds, compare the
 * free space on the filesystem holding the accounting file against the
 * acctsuspend/acctresume thresholds and move the vnode between acctp
 * (active) and savacctp (suspended) accordingly.  The argument `a' is
 * the unused timeout cookie.
 */
/* ARGSUSED */
void
acctwatch(a)
	void *a;
{
	struct statfs sb;

	if (savacctp) {
		/* Accounting is currently suspended. */
		if (savacctp->v_type == VBAD) {
			/* File vgone()'d (e.g. forced unmount): drop it. */
			(void) vn_close(savacctp, FWRITE, NOCRED, NULL);
			savacctp = NULL;
			return;
		}
		(void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0);
		if (sb.f_bavail > acctresume * sb.f_blocks / 100) {
			acctp = savacctp;
			savacctp = NULL;
			log(LOG_NOTICE, "Accounting resumed\n");
		}
	} else {
		if (acctp == NULL)
			return;
		if (acctp->v_type == VBAD) {
			/* File vgone()'d out from under us: drop it. */
			(void) vn_close(acctp, FWRITE, NOCRED, NULL);
			acctp = NULL;
			return;
		}
		(void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0);
		if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) {
			savacctp = acctp;
			acctp = NULL;
			log(LOG_NOTICE, "Accounting suspended\n");
		}
	}
	/* Re-arm ourselves for the next periodic check. */
	timeout(acctwatch, NULL, acctchkfreq * hz);
}
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
new file mode 100644
index 000000000000..3f2e4241b49a
--- /dev/null
+++ b/sys/kern/kern_descrip.c
@@ -0,0 +1,930 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * @(#)kern_descrip.c	8.8 (Berkeley) 2/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

/*
 * Descriptor management.
 */
struct filelist filehead;	/* head of list of open files */
int nfiles;			/* actual number of open files */

/*
 * System calls on descriptors.
 */

/*
 * getdtablesize(2): return the per-process descriptor table limit,
 * capped by the system-wide maxfiles.
 */
/* ARGSUSED */
int
getdtablesize(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	return (0);
}

/*
 * Duplicate a file descriptor.
 *
 * dup(2): copy descriptor fd to the lowest free slot.  Returns EBADF for
 * an out-of-range or closed fd; otherwise the new descriptor number is
 * stored through retval by finishdup().
 */
/* ARGSUSED */
int
dup(p, uap, retval)
	struct proc *p;
	struct dup_args /* {
		syscallarg(u_int) fd;
	} */ *uap;
	register_t *retval;
{
	register struct filedesc *fdp;
	u_int old;
	int new, error;

	old = SCARG(uap, fd);
	/*
	 * XXX Compatibility
	 *
	 * NOTE(review): any bit above 077 set in the fd diverts to dup2()
	 * with the fd masked to 6 bits -- presumably an old dup()-encoding
	 * compatibility hack; confirm against historical libc before
	 * removing.
	 */
	if (old &~ 077) {
		SCARG(uap, fd) &= 077;
		return (dup2(p, uap, retval));
	}

	fdp = p->p_fd;
	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL)
		return (EBADF);
	if (error = fdalloc(p, 0, &new))
		return (error);
	return (finishdup(fdp, (int)old, new, retval));
}

/*
 * Duplicate a file descriptor to a particular value.
 *
 * dup2(2): duplicate descriptor `from' onto descriptor `to', closing
 * `to' first if it is open.  Per the comment below, the implicit close
 * may not fail the call.
 */
/* ARGSUSED */
int
dup2(p, uap, retval)
	struct proc *p;
	struct dup2_args /* {
		syscallarg(u_int) from;
		syscallarg(u_int) to;
	} */ *uap;
	register_t *retval;
{
	register struct filedesc *fdp = p->p_fd;
	register int old = SCARG(uap, from), new = SCARG(uap, to);
	int i, error;

	/*
	 * NOTE(review): an out-of-range `to' yields EBADF here; POSIX
	 * specifies EBADF only for a bad `from'.  Left as-is (historical
	 * behavior).
	 */
	if (old >= fdp->fd_nfiles ||
	    fdp->fd_ofiles[old] == NULL ||
	    new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
	    new >= maxfiles)
		return (EBADF);
	if (old == new) {
		*retval = new;
		return (0);
	}
	if (new >= fdp->fd_nfiles) {
		/* Grow the table so slot `new' exists. */
		if (error = fdalloc(p, new, &i))
			return (error);
		if (new != i)
			panic("dup2: fdalloc");
	} else if (fdp->fd_ofiles[new]) {
		if (fdp->fd_ofileflags[new] & UF_MAPPED)
			(void) munmapfd(p, new);
		/*
		 * dup2() must succeed even if the close has an error.
		 */
		(void) closef(fdp->fd_ofiles[new], p);
	}
	return (finishdup(fdp, (int)old, (int)new, retval));
}

/*
 * The file control system call.
 *
 * fcntl(2): dispatch on cmd.  `arg' is a void * whose meaning is
 * per-command (an int for F_DUPFD/F_SETFD/F_SETFL/F_SETOWN, a user
 * struct flock pointer for the locking commands).
 */
/* ARGSUSED */
int
fcntl(p, uap, retval)
	struct proc *p;
	register struct fcntl_args /* {
		syscallarg(int) fd;
		syscallarg(int) cmd;
		syscallarg(void *) arg;
	} */ *uap;
	register_t *retval;
{
	int fd = SCARG(uap, fd);
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	register char *pop;
	struct vnode *vp;
	int i, tmp, error, flg = F_POSIX;
	struct flock fl;
	u_int newmin;

	if ((u_int)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL)
		return (EBADF);
	pop = &fdp->fd_ofileflags[fd];	/* per-descriptor flag byte */
	switch (SCARG(uap, cmd)) {

	case F_DUPFD:
		/* Duplicate to the lowest free descriptor >= arg. */
		newmin = (long)SCARG(uap, arg);
		if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
		    newmin >= maxfiles)
			return (EINVAL);
		if (error = fdalloc(p, newmin, &i))
			return (error);
		return (finishdup(fdp, fd, i, retval));

	case F_GETFD:
		/* Low bit of the flag byte is the close-on-exec flag. */
		*retval = *pop & 1;
		return (0);

	case F_SETFD:
		*pop = (*pop &~ 1) | ((long)SCARG(uap, arg) & 1);
		return (0);

	case F_GETFL:
		*retval = OFLAGS(fp->f_flag);
		return (0);

	case F_SETFL:
		/*
		 * Install the new FCNTLFLAGS bits, then push FNONBLOCK and
		 * FASYNC down to the object via ioctl.  If FIOASYNC fails,
		 * undo the FIONBIO change before returning the error.
		 */
		fp->f_flag &= ~FCNTLFLAGS;
		fp->f_flag |= FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS;
		tmp = fp->f_flag & FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		if (error)
			return (error);
		tmp = fp->f_flag & FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		if (!error)
			return (0);
		fp->f_flag &= ~FNONBLOCK;
		tmp = 0;
		(void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		return (error);

	case F_GETOWN:
		/*
		 * Sockets keep the pgid directly; other types answer
		 * TIOCGPGRP, negated for the historical F_GETOWN encoding.
		 */
		if (fp->f_type == DTYPE_SOCKET) {
			*retval = ((struct socket *)fp->f_data)->so_pgid;
			return (0);
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCGPGRP, (caddr_t)retval, p);
		*retval = -*retval;
		return (error);

	case F_SETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			((struct socket *)fp->f_data)->so_pgid =
			    (long)SCARG(uap, arg);
			return (0);
		}
		/*
		 * arg <= 0 names a process group (negated); arg > 0 names a
		 * process, which must exist, and is translated to its pgrp.
		 */
		if ((long)SCARG(uap, arg) <= 0) {
			SCARG(uap, arg) = (void *)(-(long)SCARG(uap, arg));
		} else {
			struct proc *p1 = pfind((long)SCARG(uap, arg));
			if (p1 == 0)
				return (ESRCH);
			SCARG(uap, arg) = (void *)(long)p1->p_pgrp->pg_id;
		}
		return ((*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&SCARG(uap, arg), p));

	case F_SETLKW:
		flg |= F_WAIT;
		/* Fall into F_SETLK */

	case F_SETLK:
		if (fp->f_type != DTYPE_VNODE)
			return (EBADF);
		vp = (struct vnode *)fp->f_data;
		/* Copy in the lock structure */
		error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
		    sizeof (fl));
		if (error)
			return (error);
		if (fl.l_whence == SEEK_CUR)
			fl.l_start += fp->f_offset;
		switch (fl.l_type) {

		case F_RDLCK:
			if ((fp->f_flag & FREAD) == 0)
				return (EBADF);
			/* Remember we hold POSIX locks, for closef(). */
			p->p_flag |= P_ADVLOCK;
			return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg));

		case F_WRLCK:
			if ((fp->f_flag & FWRITE) == 0)
				return (EBADF);
			p->p_flag |= P_ADVLOCK;
			return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg));

		case F_UNLCK:
			return (VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &fl,
				F_POSIX));

		default:
			return (EINVAL);
		}

	case F_GETLK:
		if (fp->f_type != DTYPE_VNODE)
			return (EBADF);
		vp = (struct vnode *)fp->f_data;
		/* Copy in the lock structure */
		error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
		    sizeof (fl));
		if (error)
			return (error);
		if (fl.l_whence == SEEK_CUR)
			fl.l_start += fp->f_offset;
		if (error = VOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX))
			return (error);
		return (copyout((caddr_t)&fl, (caddr_t)SCARG(uap, arg),
		    sizeof (fl)));

	default:
		return (EINVAL);
	}
	/* NOTREACHED */
}

/*
 * Common code for dup, dup2, and fcntl(F_DUPFD).
 *
 * Point slot `new' at the file in slot `old', propagate the flag byte
 * (clearing close-on-exec per POSIX dup semantics), bump the file's
 * reference count, update fd_lastfile, and report `new' via retval.
 */
int
finishdup(fdp, old, new, retval)
	register struct filedesc *fdp;
	register int old, new;
	register_t *retval;
{
	register struct file *fp;

	fp = fdp->fd_ofiles[old];
	fdp->fd_ofiles[new] = fp;
	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
	fp->f_count++;
	if (new > fdp->fd_lastfile)
		fdp->fd_lastfile = new;
	*retval = new;
	return (0);
}

/*
 * Close a file descriptor.
 *
 * close(2): detach the descriptor slot (unmapping any mapped-file
 * descriptor first), maintain the fd_lastfile/fd_freefile hints, then
 * drop the file reference via closef().
 */
/* ARGSUSED */
int
close(p, uap, retval)
	struct proc *p;
	struct close_args /* {
		syscallarg(int) fd;
	} */ *uap;
	register_t *retval;
{
	int fd = SCARG(uap, fd);
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	register u_char *pf;

	if ((u_int)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL)
		return (EBADF);
	pf = (u_char *)&fdp->fd_ofileflags[fd];
	if (*pf & UF_MAPPED)
		(void) munmapfd(p, fd);
	fdp->fd_ofiles[fd] = NULL;
	while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
		fdp->fd_lastfile--;
	if (fd < fdp->fd_freefile)
		fdp->fd_freefile = fd;
	*pf = 0;
	return (closef(fp, p));
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Return status information about a file descriptor.
 *
 * 4.3BSD-compatible fstat: stat through the descriptor into the modern
 * struct stat, then convert to the old struct ostat for the caller.
 */
/* ARGSUSED */
int
compat_43_fstat(p, uap, retval)
	struct proc *p;
	register struct compat_43_fstat_args /* {
		syscallarg(int) fd;
		syscallarg(struct ostat *) sb;
	} */ *uap;
	register_t *retval;
{
	int fd = SCARG(uap, fd);
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	struct stat ub;
	struct ostat oub;
	int error;

	if ((u_int)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL)
		return (EBADF);
	switch (fp->f_type) {

	case DTYPE_VNODE:
		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
		break;

	case DTYPE_SOCKET:
		error = soo_stat((struct socket *)fp->f_data, &ub);
		break;

	default:
		/* Only vnodes and sockets exist in this kernel. */
		panic("ofstat");
		/*NOTREACHED*/
	}
	/*
	 * NOTE(review): cvtstat() runs before the error check, so on
	 * failure it converts an uninitialized stat buffer (the result is
	 * never copied out, so this is harmless but untidy).
	 */
	cvtstat(&ub, &oub);
	if (error == 0)
		error = copyout((caddr_t)&oub, (caddr_t)SCARG(uap, sb),
		    sizeof (oub));
	return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

/*
 * Return status information about a file descriptor.
 *
 * fstat(2): dispatch on descriptor type (vnode or socket) and copy the
 * resulting struct stat out to userland.
 */
/* ARGSUSED */
int
fstat(p, uap, retval)
	struct proc *p;
	register struct fstat_args /* {
		syscallarg(int) fd;
		syscallarg(struct stat *) sb;
	} */ *uap;
	register_t *retval;
{
	int fd = SCARG(uap, fd);
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	struct stat ub;
	int error;

	if ((u_int)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL)
		return (EBADF);
	switch (fp->f_type) {

	case DTYPE_VNODE:
		error = vn_stat((struct vnode *)fp->f_data, &ub, p);
		break;

	case DTYPE_SOCKET:
		error = soo_stat((struct socket *)fp->f_data, &ub);
		break;

	default:
		panic("fstat");
		/*NOTREACHED*/
	}
	if (error == 0)
		error = copyout((caddr_t)&ub, (caddr_t)SCARG(uap, sb),
		    sizeof (ub));
	return (error);
}

/*
 * Return pathconf information about a file descriptor.
 *
 * fpathconf(2): sockets answer only _PC_PIPE_BUF; vnodes defer to the
 * filesystem via VOP_PATHCONF.
 */
/* ARGSUSED */
int
fpathconf(p, uap, retval)
	struct proc *p;
	register struct fpathconf_args /* {
		syscallarg(int) fd;
		syscallarg(int) name;
	} */ *uap;
	register_t *retval;
{
	int fd = SCARG(uap, fd);
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	struct vnode *vp;

	if ((u_int)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL)
		return (EBADF);
	switch (fp->f_type) {

	case DTYPE_SOCKET:
		if (SCARG(uap, name) != _PC_PIPE_BUF)
			return (EINVAL);
		*retval = PIPE_BUF;
		return (0);

	case DTYPE_VNODE:
		vp = (struct vnode *)fp->f_data;
		return (VOP_PATHCONF(vp, SCARG(uap, name), retval));

	default:
		panic("fpathconf");
	}
	/*NOTREACHED*/
}

/*
 * Allocate a file descriptor for the process.
 */
int fdexpand;		/* statistic: number of ofile-array expansions */

/*
 * Find a free descriptor slot >= `want' for process p, growing the
 * ofile/ofileflags arrays as needed (geometric doubling from NDEXTENT).
 * On success the chosen index is stored through `result' and the slot's
 * flag byte is cleared; returns EMFILE when the per-process limit is hit.
 */
int
fdalloc(p, want, result)
	struct proc *p;
	int want;
	int *result;
{
	register struct filedesc *fdp = p->p_fd;
	register int i;
	int lim, last, nfiles;
	struct file **newofile;
	char *newofileflags;

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.  If that fails, consider
	 * expanding the ofile array.
	 */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	for (;;) {
		last = min(fdp->fd_nfiles, lim);
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		for (; i < last; i++) {
			if (fdp->fd_ofiles[i] == NULL) {
				fdp->fd_ofileflags[i] = 0;
				if (i > fdp->fd_lastfile)
					fdp->fd_lastfile = i;
				if (want <= fdp->fd_freefile)
					fdp->fd_freefile = i;
				*result = i;
				return (0);
			}
		}

		/*
		 * No space in current array.  Expand?
		 */
		if (fdp->fd_nfiles >= lim)
			return (EMFILE);
		if (fdp->fd_nfiles < NDEXTENT)
			nfiles = NDEXTENT;
		else
			nfiles = 2 * fdp->fd_nfiles;
		/* Pointer array and flag bytes share one allocation. */
		MALLOC(newofile, struct file **, nfiles * OFILESIZE,
		    M_FILEDESC, M_WAITOK);
		newofileflags = (char *) &newofile[nfiles];
		/*
		 * Copy the existing ofile and ofileflags arrays
		 * and zero the new portion of each array.
		 */
		bcopy(fdp->fd_ofiles, newofile,
			(i = sizeof(struct file *) * fdp->fd_nfiles));
		bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i);
		bcopy(fdp->fd_ofileflags, newofileflags,
			(i = sizeof(char) * fdp->fd_nfiles));
		bzero(newofileflags + i, nfiles * sizeof(char) - i);
		/* The initial NDFILE slots live inside struct filedesc0. */
		if (fdp->fd_nfiles > NDFILE)
			FREE(fdp->fd_ofiles, M_FILEDESC);
		fdp->fd_ofiles = newofile;
		fdp->fd_ofileflags = newofileflags;
		fdp->fd_nfiles = nfiles;
		fdexpand++;
	}
}

/*
 * Check to see whether n user file descriptors
 * are available to the process p.
 *
 * Counts headroom below the rlimit plus free slots already in the
 * table; returns 1 if at least n descriptors could be allocated,
 * 0 otherwise.
 */
int
fdavail(p, n)
	struct proc *p;
	register int n;
{
	register struct filedesc *fdp = p->p_fd;
	register struct file **fpp;
	register int i, lim;

	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
		return (1);
	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
	for (i = fdp->fd_nfiles - fdp->fd_freefile; --i >= 0; fpp++)
		if (*fpp == NULL && --n <= 0)
			return (1);
	return (0);
}

/*
 * Create a new open file structure and allocate
 * a file decriptor for the process that refers to it.
 *
 * On success the new struct file (f_count == 1, credentials held) is
 * installed in the descriptor table; the file and/or index are returned
 * through resultfp/resultfd when non-NULL.  Fails with ENFILE when the
 * system-wide file table is full.
 */
int
falloc(p, resultfp, resultfd)
	register struct proc *p;
	struct file **resultfp;
	int *resultfd;
{
	register struct file *fp, *fq;
	int error, i;

	if (error = fdalloc(p, 0, &i))
		return (error);
	if (nfiles >= maxfiles) {
		tablefull("file");
		return (ENFILE);
	}
	/*
	 * Allocate a new file descriptor.
	 * If the process has file descriptor zero open, add to the list
	 * of open files at that point, otherwise put it at the front of
	 * the list of open files.
	 */
	nfiles++;
	MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK);
	bzero(fp, sizeof(struct file));
	if (fq = p->p_fd->fd_ofiles[0]) {
		LIST_INSERT_AFTER(fq, fp, f_list);
	} else {
		LIST_INSERT_HEAD(&filehead, fp, f_list);
	}
	p->p_fd->fd_ofiles[i] = fp;
	fp->f_count = 1;
	fp->f_cred = p->p_ucred;
	crhold(fp->f_cred);
	if (resultfp)
		*resultfp = fp;
	if (resultfd)
		*resultfd = i;
	return (0);
}

/*
 * Free a file descriptor.
 *
 * Unlink the struct file from the global list, release its credential
 * reference, and return the storage.  Caller must already hold the last
 * reference (see closef()).
 */
void
ffree(fp)
	register struct file *fp;
{
	register struct file *fq;

	LIST_REMOVE(fp, f_list);
	crfree(fp->f_cred);
#ifdef DIAGNOSTIC
	fp->f_count = 0;	/* catch use-after-free via refcount */
#endif
	nfiles--;
	FREE(fp, M_FILE);
}

/*
 * Copy a filedesc structure.
 *
 * Used on fork: clones p's descriptor table, taking a reference on the
 * current/root directory vnodes and on every open file.
 */
struct filedesc *
fdcopy(p)
	struct proc *p;
{
	register struct filedesc *newfdp, *fdp = p->p_fd;
	register struct file **fpp;
	register int i;

	MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0),
	    M_FILEDESC, M_WAITOK);
	bcopy(fdp, newfdp, sizeof(struct filedesc));
	VREF(newfdp->fd_cdir);
	if (newfdp->fd_rdir)
		VREF(newfdp->fd_rdir);
	newfdp->fd_refcnt = 1;

	/*
	 * If the number of open files fits in the internal arrays
	 * of the open file structure, use them, otherwise allocate
	 * additional memory for the number of descriptors currently
	 * in use.
	 */
	if (newfdp->fd_lastfile < NDFILE) {
		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
		newfdp->fd_ofileflags =
		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
		i = NDFILE;
	} else {
		/*
		 * Compute the smallest multiple of NDEXTENT needed
		 * for the file descriptors currently in use,
		 * allowing the table to shrink.
		 */
		i = newfdp->fd_nfiles;
		while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
			i /= 2;
		MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE,
		    M_FILEDESC, M_WAITOK);
		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
	}
	newfdp->fd_nfiles = i;
	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
	fpp = newfdp->fd_ofiles;
	for (i = newfdp->fd_lastfile; i-- >= 0; fpp++)
		if (*fpp != NULL)
			(*fpp)->f_count++;
	return (newfdp);
}

/*
 * Release a filedesc structure.
 *
 * Drops one reference; on the last reference, closes every open file,
 * frees an externally-allocated ofile array, releases the directory
 * vnodes, and frees the structure itself.
 */
void
fdfree(p)
	struct proc *p;
{
	register struct filedesc *fdp = p->p_fd;
	struct file **fpp;
	register int i;

	if (--fdp->fd_refcnt > 0)
		return;
	fpp = fdp->fd_ofiles;
	for (i = fdp->fd_lastfile; i-- >= 0; fpp++)
		if (*fpp)
			(void) closef(*fpp, p);
	if (fdp->fd_nfiles > NDFILE)
		FREE(fdp->fd_ofiles, M_FILEDESC);
	vrele(fdp->fd_cdir);
	if (fdp->fd_rdir)
		vrele(fdp->fd_rdir);
	FREE(fdp, M_FILEDESC);
}

/*
 * Internal form of close.
 * Decrement reference count on file structure.
 * Note: p may be NULL when closing a file
 * that was being passed in a message.
 */
int
closef(fp, p)
	register struct file *fp;
	register struct proc *p;
{
	struct vnode *vp;
	struct flock lf;
	int error;

	if (fp == NULL)
		return (0);
	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		vp = (struct vnode *)fp->f_data;
		(void) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_POSIX);
	}
	if (--fp->f_count > 0)
		return (0);
	if (fp->f_count < 0)
		panic("closef: count < 0");
	/*
	 * Last reference: release any flock()-style lock (keyed by the
	 * struct file, not the process) before tearing the file down.
	 */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		vp = (struct vnode *)fp->f_data;
		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops)
		error = (*fp->f_ops->fo_close)(fp, p);
	else
		error = 0;
	ffree(fp);
	return (error);
}

/*
 * Apply an advisory lock on a file descriptor.
 *
 * Just attempt to get a record lock of the requested type on
 * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
 */
/* ARGSUSED */
int
flock(p, uap, retval)
	struct proc *p;
	register struct flock_args /* {
		syscallarg(int) fd;
		syscallarg(int) how;
	} */ *uap;
	register_t *retval;
{
	int fd = SCARG(uap, fd);
	int how = SCARG(uap, how);
	register struct filedesc *fdp = p->p_fd;
	register struct file *fp;
	struct vnode *vp;
	struct flock lf;

	if ((u_int)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_VNODE)
		return (EOPNOTSUPP);
	vp = (struct vnode *)fp->f_data;
	lf.l_whence = SEEK_SET;
	lf.l_start = 0;
	lf.l_len = 0;
	if (how & LOCK_UN) {
		lf.l_type = F_UNLCK;
		fp->f_flag &= ~FHASLOCK;
		return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK));
	}
	if (how & LOCK_EX)
		lf.l_type = F_WRLCK;
	else if (how & LOCK_SH)
		lf.l_type = F_RDLCK;
	else
		return (EBADF);
	/* flock()-style locks are keyed by struct file (see closef()). */
	fp->f_flag |= FHASLOCK;
	if (how & LOCK_NB)
		return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK));
	return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT));
}

/*
 * File Descriptor pseudo-device driver (/dev/fd/).
 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file will be direct to the other driver.
 */
/* ARGSUSED */
int
fdopen(dev, mode, type, p)
	dev_t dev;
	int mode, type;
	struct proc *p;
{

	/*
	 * XXX Kludge: set curproc->p_dupfd to contain the value of the
	 * the file descriptor being sought for duplication.  The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open.  Open will detect this special error and take the
	 * actions in dupfdopen below.  Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	p->p_dupfd = minor(dev);
	return (ENODEV);
}

/*
 * Duplicate the specified descriptor to a free descriptor.
 *
 * Second half of the /dev/fd kludge above: called from the open path
 * with the ENODEV/ENXIO error produced by fdopen().  indx is the slot
 * already allocated for the open; dfd is the descriptor to duplicate.
 */
int
dupfdopen(fdp, indx, dfd, mode, error)
	register struct filedesc *fdp;
	register int indx, dfd;
	int mode;
	int error;
{
	register struct file *wfp;
	struct file *fp;

	/*
	 * If the to-be-dup'd fd number is greater than the allowed number
	 * of file descriptors, or the fd to be dup'd has already been
	 * closed, reject.  Note, check for new == old is necessary as
	 * falloc could allocate an already closed to-be-dup'd descriptor
	 * as the new descriptor.
	 */
	fp = fdp->fd_ofiles[indx];
	if ((u_int)dfd >= fdp->fd_nfiles ||
	    (wfp = fdp->fd_ofiles[dfd]) == NULL || fp == wfp)
		return (EBADF);

	/*
	 * There are two cases of interest here.
	 *
	 * For ENODEV simply dup (dfd) to file descriptor
	 * (indx) and return.
	 *
	 * For ENXIO steal away the file structure from (dfd) and
	 * store it in (indx).  (dfd) is effectively closed by
	 * this operation.
	 *
	 * Any other error code is just returned.
	 */
	switch (error) {
	case ENODEV:
		/*
		 * Check that the mode the file is being opened for is a
		 * subset of the mode of the existing descriptor.
		 */
		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag)
			return (EACCES);
		fdp->fd_ofiles[indx] = wfp;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		wfp->f_count++;
		if (indx > fdp->fd_lastfile)
			fdp->fd_lastfile = indx;
		return (0);

	case ENXIO:
		/*
		 * Steal away the file pointer from dfd, and stuff it into indx.
		 */
		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
		fdp->fd_ofiles[dfd] = NULL;
		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
		fdp->fd_ofileflags[dfd] = 0;
		/*
		 * Complete the clean up of the filedesc structure by
		 * recomputing the various hints.
		 */
		if (indx > fdp->fd_lastfile)
			fdp->fd_lastfile = indx;
		else
			while (fdp->fd_lastfile > 0 &&
			       fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
				fdp->fd_lastfile--;
		if (dfd < fdp->fd_freefile)
			fdp->fd_freefile = dfd;
		return (0);

	default:
		return (error);
	}
	/* NOTREACHED */
}
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
new file mode 100644
index 000000000000..4ed48ac9110e
--- /dev/null
+++ b/sys/kern/kern_exit.c
@@ -0,0 +1,453 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * @(#)kern_exit.c	8.10 (Berkeley) 2/23/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/map.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/ptrace.h>

#include <machine/cpu.h>
#ifdef COMPAT_43
#include <machine/reg.h>
#include <machine/psl.h>
#endif

#include <vm/vm.h>
#include <vm/vm_kern.h>

__dead void cpu_exit __P((struct proc *));
__dead void exit1 __P((struct proc *, int));

/*
 * exit --
 *	Death of process.
 *
 * System call entry point: encodes the user-supplied exit value with
 * W_EXITCODE() and hands off to exit1(), which never returns.
 */
struct rexit_args {
	int	rval;
};
__dead void
exit(p, uap, retval)
	struct proc *p;
	struct rexit_args *uap;
	int *retval;
{

	exit1(p, W_EXITCODE(uap->rval, 0));
	/* NOTREACHED */
}

/*
 * Exit: deallocate address space and other resources, change proc state
 * to zombie, and unlink proc from allproc and parent's lists.  Save exit
 * status and rusage for wait().  Check for child processes and orphan them.
 *
 * p  - the exiting process (the current process)
 * rv - exit status already encoded with W_EXITCODE()/signal macros
 *
 * Never returns: finishes by calling cpu_exit(), which switches away
 * for the last time.
 */
__dead void
exit1(p, rv)
	register struct proc *p;
	int rv;
{
	register struct proc *q, *nq;
	register struct proc **pp;
	register struct vmspace *vm;

	/* init (pid 1) must never exit; everything reparents to it. */
	if (p->p_pid == 1)
		panic("init died (signal %d, exit %d)",
		    WTERMSIG(rv), WEXITSTATUS(rv));
#ifdef PGINPROF
	vmsizmon();
#endif
	if (p->p_flag & P_PROFIL)
		stopprofclock(p);
	/*
	 * Allocate the zombie's rusage record now, while sleeping is
	 * still safe; it is filled in below and handed to wait().
	 */
	MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
	    M_ZOMBIE, M_WAITOK);
	/*
	 * If parent is waiting for us to exit or exec,
	 * P_PPWAIT is set; we will wakeup the parent below.
	 */
	p->p_flag &= ~(P_TRACED | P_PPWAIT);
	p->p_flag |= P_WEXIT;
	p->p_sigignore = ~0;
	p->p_siglist = 0;
	/* Cancel any pending interval timer before becoming a zombie. */
	untimeout(realitexpire, (caddr_t)p);

	/*
	 * Close open files and release open-file table.
	 * This may block!
	 */
	fdfree(p);

	/* The next two chunks should probably be moved to vmspace_exit. */
	vm = p->p_vmspace;
#ifdef SYSVSHM
	if (vm->vm_shm)
		shmexit(p);
#endif
	/*
	 * Release user portion of address space.
	 * This releases references to vnodes,
	 * which could cause I/O if the file has been unlinked.
	 * Need to do this early enough that we can still sleep.
	 * Can't free the entire vmspace as the kernel stack
	 * may be mapped within that space also.
	 */
	if (vm->vm_refcnt == 1)
		(void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS,
		    VM_MAXUSER_ADDRESS);

	if (SESS_LEADER(p)) {
		register struct session *sp = p->p_session;

		if (sp->s_ttyvp) {
			/*
			 * Controlling process.
			 * Signal foreground pgrp,
			 * drain controlling terminal
			 * and revoke access to controlling terminal.
			 */
			if (sp->s_ttyp->t_session == sp) {
				if (sp->s_ttyp->t_pgrp)
					pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
				(void) ttywait(sp->s_ttyp);
				/*
				 * The tty could have been revoked
				 * if we blocked.
				 */
				if (sp->s_ttyvp)
					VOP_REVOKE(sp->s_ttyvp, REVOKEALL);
			}
			if (sp->s_ttyvp)
				vrele(sp->s_ttyvp);
			sp->s_ttyvp = NULL;
			/*
			 * s_ttyp is not zero'd; we use this to indicate
			 * that the session once had a controlling terminal.
			 * (for logging and informational purposes)
			 */
		}
		sp->s_leader = NULL;
	}
	fixjobc(p, p->p_pgrp, 0);
	/*
	 * Lift the file-size limit so the accounting record can always
	 * be appended, then write it.
	 */
	p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
	(void)acct_process(p);
#ifdef KTRACE
	/*
	 * release trace file
	 */
	p->p_traceflag = 0;	/* don't trace the vrele() */
	if (p->p_tracep)
		vrele(p->p_tracep);
#endif
	/*
	 * Remove proc from allproc queue and pidhash chain.
	 * Place onto zombproc.  Unlink from parent's child list.
	 */
	LIST_REMOVE(p, p_list);
	LIST_INSERT_HEAD(&zombproc, p, p_list);
	p->p_stat = SZOMB;

	LIST_REMOVE(p, p_hash);

	/* Hand all children over to init, which will reap the zombies. */
	q = p->p_children.lh_first;
	if (q)		/* only need this if any child is S_ZOMB */
		wakeup((caddr_t) initproc);
	for (; q != 0; q = nq) {
		nq = q->p_sibling.le_next;
		LIST_REMOVE(q, p_sibling);
		LIST_INSERT_HEAD(&initproc->p_children, q, p_sibling);
		q->p_pptr = initproc;
		/*
		 * Traced processes are killed
		 * since their existence means someone is screwing up.
		 */
		if (q->p_flag & P_TRACED) {
			q->p_flag &= ~P_TRACED;
			psignal(q, SIGKILL);
		}
	}

	/*
	 * Save exit status and final rusage info, adding in child rusage
	 * info and self times.
	 */
	p->p_xstat = rv;
	*p->p_ru = p->p_stats->p_ru;
	calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL);
	ruadd(p->p_ru, &p->p_stats->p_cru);

	/*
	 * Notify parent that we're gone.
	 */
	psignal(p->p_pptr, SIGCHLD);
	wakeup((caddr_t)p->p_pptr);
#if defined(tahoe)
	/* move this to cpu_exit */
	p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL;
#endif
	/*
	 * Clear curproc after we've done all operations
	 * that could block, and before tearing down the rest
	 * of the process state that might be used from clock, etc.
	 * Also, can't clear curproc while we're still runnable,
	 * as we're not on a run queue (we are current, just not
	 * a proper proc any longer!).
	 *
	 * Other substructures are freed from wait().
	 */
	curproc = NULL;
	if (--p->p_limit->p_refcnt == 0)
		FREE(p->p_limit, M_SUBPROC);

	/*
	 * Finally, call machine-dependent code to release the remaining
	 * resources including address space, the kernel stack and pcb.
	 * The address space is released by "vmspace_free(p->p_vmspace)";
	 * This is machine-dependent, as we may have to change stacks
	 * or ensure that the current one isn't reallocated before we
	 * finish.  cpu_exit will end with a call to cpu_swtch(), finishing
	 * our execution (pun intended).
	 */
	cpu_exit(p);
}

struct wait_args {
	int	pid;
	int	*status;
	int	options;
	struct	rusage *rusage;
#ifdef COMPAT_43
	int	compat;		/* pseudo */
#endif
};

#ifdef COMPAT_43
/*
 * 4.3BSD wait() compatibility: the old wait(2) passed its optional
 * arguments in registers, flagged by the condition codes in the PSW.
 */
#if defined(hp300) || defined(luna68k)
#include <machine/frame.h>
#define GETPS(rp)	((struct frame *)(rp))->f_sr
#else
#define GETPS(rp)	(rp)[PS]
#endif

compat_43_wait(p, uap, retval)
	struct proc *p;
	register struct wait_args *uap;
	int *retval;
{

#ifdef PSL_ALLCC
	/* All condition codes set => caller passed options/rusage in R0/R1. */
	if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) {
		uap->options = 0;
		uap->rusage = NULL;
	} else {
		uap->options = p->p_md.md_regs[R0];
		uap->rusage = (struct rusage *)p->p_md.md_regs[R1];
	}
#else
	uap->options = 0;
	uap->rusage = NULL;
#endif
	uap->pid = WAIT_ANY;
	uap->status = NULL;
	uap->compat = 1;
	return (wait1(p, uap, retval));
}

wait4(p, uap, retval)
	struct proc *p;
	struct wait_args *uap;
	int *retval;
{

	uap->compat = 0;
	return (wait1(p, uap, retval));
}
#else
#define	wait1	wait4
#endif

/*
 * Common guts of wait4()/compat wait(): find a matching child that is a
 * zombie (reap it) or stopped (report it), or sleep until one appears.
 *
 * q      - the waiting (parent) process
 * uap    - wait arguments; uap->pid selects which children match
 * retval - retval[0] gets the pid; retval[1] the 4.3BSD status, if compat
 */
int
wait1(q, uap, retval)
	register struct proc *q;
	register struct wait_args *uap;
	int retval[];
{
	register int nfound;
	register struct proc *p, *t;
	int status, error;

	/* pid 0 means "any child in my process group". */
	if (uap->pid == 0)
		uap->pid = -q->p_pgid;
#ifdef notyet
	if (uap->options &~ (WUNTRACED|WNOHANG))
		return (EINVAL);
#endif
loop:
	nfound = 0;
	for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) {
		if (uap->pid != WAIT_ANY &&
		    p->p_pid != uap->pid && p->p_pgid != -uap->pid)
			continue;
		nfound++;
		if (p->p_stat == SZOMB) {
			retval[0] = p->p_pid;
#ifdef COMPAT_43
			if (uap->compat)
				retval[1] = p->p_xstat;
			else
#endif
			if (uap->status) {
				status = p->p_xstat;	/* convert to int */
				if (error = copyout((caddr_t)&status,
				    (caddr_t)uap->status, sizeof(status)))
					return (error);
			}
			if (uap->rusage && (error = copyout((caddr_t)p->p_ru,
			    (caddr_t)uap->rusage, sizeof (struct rusage))))
				return (error);
			/*
			 * If we got the child via a ptrace 'attach',
			 * we need to give it back to the old parent.
			 */
			if (p->p_oppid && (t = pfind(p->p_oppid))) {
				p->p_oppid = 0;
				proc_reparent(p, t);
				psignal(t, SIGCHLD);
				wakeup((caddr_t)t);
				return (0);
			}
			p->p_xstat = 0;
			ruadd(&q->p_stats->p_cru, p->p_ru);
			FREE(p->p_ru, M_ZOMBIE);

			/*
			 * Decrement the count of procs running with this uid.
			 */
			(void)chgproccnt(p->p_cred->p_ruid, -1);

			/*
			 * Free up credentials.
			 */
			if (--p->p_cred->p_refcnt == 0) {
				crfree(p->p_cred->pc_ucred);
				FREE(p->p_cred, M_SUBPROC);
			}

			/*
			 * Release reference to text vnode
			 */
			if (p->p_textvp)
				vrele(p->p_textvp);

			/*
			 * Finally finished with old proc entry.
			 * Unlink it from its process group and free it.
			 */
			leavepgrp(p);
			LIST_REMOVE(p, p_list);	/* off zombproc */
			LIST_REMOVE(p, p_sibling);

			/*
			 * Give machine-dependent layer a chance
			 * to free anything that cpu_exit couldn't
			 * release while still running in process context.
			 */
			cpu_wait(p);
			FREE(p, M_PROC);
			nprocs--;
			return (0);
		}
		if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 &&
		    (p->p_flag & P_TRACED || uap->options & WUNTRACED)) {
			/* Report each stop only once (P_WAITED). */
			p->p_flag |= P_WAITED;
			retval[0] = p->p_pid;
#ifdef COMPAT_43
			if (uap->compat) {
				retval[1] = W_STOPCODE(p->p_xstat);
				error = 0;
			} else
#endif
			if (uap->status) {
				status = W_STOPCODE(p->p_xstat);
				error = copyout((caddr_t)&status,
				    (caddr_t)uap->status, sizeof(status));
			} else
				error = 0;
			return (error);
		}
	}
	if (nfound == 0)
		return (ECHILD);
	if (uap->options & WNOHANG) {
		retval[0] = 0;
		return (0);
	}
	/* Sleep until a child changes state, then rescan from the top. */
	if (error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))
		return (error);
	goto loop;
}

/*
 * make process 'parent' the new parent of process 'child'.
+ */ +void +proc_reparent(child, parent) + register struct proc *child; + register struct proc *parent; +{ + + if (child->p_pptr == parent) + return; + + LIST_REMOVE(child, p_sibling); + LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); + child->p_pptr = parent; +} diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c new file mode 100644 index 000000000000..6c5f22f0d037 --- /dev/null +++ b/sys/kern/kern_fork.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_fork.c	8.8 (Berkeley) 2/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/map.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/acct.h>
#include <sys/ktrace.h>

/*
 * fork system call: thin wrapper over the common fork1() path.
 */
/* ARGSUSED */
fork(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	return (fork1(p, 0, retval));
}

/*
 * vfork system call: like fork but shares address space with the parent
 * and blocks the parent until the child execs or exits (isvfork flag).
 */
/* ARGSUSED */
vfork(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	return (fork1(p, 1, retval));
}

int	nprocs = 1;		/* process 0 */

/*
 * Common fork code.
 *
 * p1      - forking (parent) process
 * isvfork - nonzero for vfork semantics (shared VM, parent waits)
 * retval  - retval[0] gets the other process' pid,
 *           retval[1] is 1 in the child and 0 in the parent
 *
 * Returns 0 on success (in both processes) or EAGAIN when the process
 * table or the per-uid process limit would be exceeded.
 */
fork1(p1, isvfork, retval)
	register struct proc *p1;
	int isvfork;
	register_t *retval;
{
	register struct proc *p2;
	register uid_t uid;
	struct proc *newproc;
	struct proc **hash;
	int count;
	static int nextpid, pidchecked = 0;

	/*
	 * Although process entries are dynamically created, we still keep
	 * a global limit on the maximum number we will create.  Don't allow
	 * a nonprivileged user to use the last process; don't let root
	 * exceed the limit. The variable nprocs is the current number of
	 * processes, maxproc is the limit.
	 */
	uid = p1->p_cred->p_ruid;
	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
		tablefull("proc");
		return (EAGAIN);
	}

	/*
	 * Increment the count of procs running with this uid. Don't allow
	 * a nonprivileged user to exceed their current limit.
	 */
	count = chgproccnt(uid, 1);
	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
		(void)chgproccnt(uid, -1);
		return (EAGAIN);
	}

	/* Allocate new proc. */
	MALLOC(newproc, struct proc *, sizeof(struct proc), M_PROC, M_WAITOK);

	/*
	 * Find an unused process ID.  We remember a range of unused IDs
	 * ready to use (from nextpid+1 through pidchecked-1).
	 */
	nextpid++;
retry:
	/*
	 * If the process ID prototype has wrapped around,
	 * restart somewhat above 0, as the low-numbered procs
	 * tend to include daemons that don't exit.
	 */
	if (nextpid >= PID_MAX) {
		nextpid = 100;
		pidchecked = 0;
	}
	if (nextpid >= pidchecked) {
		int doingzomb = 0;

		pidchecked = PID_MAX;
		/*
		 * Scan the active and zombie procs to check whether this pid
		 * is in use.  Remember the lowest pid that's greater
		 * than nextpid, so we can avoid checking for a while.
		 */
		p2 = allproc.lh_first;
again:
		for (; p2 != 0; p2 = p2->p_list.le_next) {
			/* pids and pgrp ids share one namespace. */
			while (p2->p_pid == nextpid ||
			    p2->p_pgrp->pg_id == nextpid) {
				nextpid++;
				if (nextpid >= pidchecked)
					goto retry;
			}
			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
				pidchecked = p2->p_pid;
			if (p2->p_pgrp->pg_id > nextpid &&
			    pidchecked > p2->p_pgrp->pg_id)
				pidchecked = p2->p_pgrp->pg_id;
		}
		if (!doingzomb) {
			/* Second pass: zombies still own their pids. */
			doingzomb = 1;
			p2 = zombproc.lh_first;
			goto again;
		}
	}

	nprocs++;
	p2 = newproc;
	p2->p_stat = SIDL;			/* protect against others */
	p2->p_pid = nextpid;
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	p2->p_forw = p2->p_back = NULL;		/* shouldn't be necessary */
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);

	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */
	bzero(&p2->p_startzero,
	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 * The p_stats and p_sigacts substructs are set in vm_fork.
	 */
	p2->p_flag = P_INMEM;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);
	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
	    M_SUBPROC, M_WAITOK);
	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
	p2->p_cred->p_refcnt = 1;
	crhold(p1->p_ucred);

	/* bump references to the text vnode (for procfs) */
	p2->p_textvp = p1->p_textvp;
	if (p2->p_textvp)
		VREF(p2->p_textvp);

	p2->p_fd = fdcopy(p1);
	/*
	 * If p_limit is still copy-on-write, bump refcnt,
	 * otherwise get a copy that won't be modified.
	 * (If PL_SHAREMOD is clear, the structure is shared
	 * copy-on-write.)
	 */
	if (p1->p_limit->p_lflags & PL_SHAREMOD)
		p2->p_limit = limcopy(p1->p_limit);
	else {
		p2->p_limit = p1->p_limit;
		p2->p_limit->p_refcnt++;
	}

	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
		p2->p_flag |= P_CONTROLT;
	if (isvfork)
		p2->p_flag |= P_PPWAIT;
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	p2->p_pptr = p1;
	LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
	LIST_INIT(&p2->p_children);

#ifdef KTRACE
	/*
	 * Copy traceflag and tracefile if enabled.
	 * If not inherited, these were zeroed above.
	 */
	if (p1->p_traceflag&KTRFAC_INHERIT) {
		p2->p_traceflag = p1->p_traceflag;
		if ((p2->p_tracep = p1->p_tracep) != NULL)
			VREF(p2->p_tracep);
	}
#endif

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	p1->p_flag |= P_NOSWAP;
	/*
	 * Set return values for child before vm_fork,
	 * so they can be copied to child stack.
	 * We return parent pid, and mark as child in retval[1].
	 * NOTE: the kernel stack may be at a different location in the child
	 * process, and thus addresses of automatic variables (including retval)
	 * may be invalid after vm_fork returns in the child process.
	 */
	retval[0] = p1->p_pid;
	retval[1] = 1;
	if (vm_fork(p1, p2, isvfork)) {
		/*
		 * Child process.  Set start time and get to work.
		 */
		(void) splclock();
		p2->p_stats->p_start = time;
		(void) spl0();
		p2->p_acflag = AFORK;
		return (0);
	}

	/*
	 * Make child runnable and add to run queue.
	 */
	(void) splhigh();
	p2->p_stat = SRUN;
	setrunqueue(p2);
	(void) spl0();

	/*
	 * Now can be swapped.
	 */
	p1->p_flag &= ~P_NOSWAP;

	/*
	 * Preserve synchronization semantics of vfork.  If waiting for
	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
	 * proc (in case of exit).
	 */
	if (isvfork)
		while (p2->p_flag & P_PPWAIT)
			tsleep(p1, PWAIT, "ppwait", 0);

	/*
	 * Return child pid to parent process,
	 * marking us as parent via retval[1].
	 */
	retval[0] = p2->p_pid;
	retval[1] = 0;
	return (0);
}
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
new file mode 100644
index 000000000000..b84175439a85
--- /dev/null
+++ b/sys/kern/kern_ktrace.c
@@ -0,0 +1,475 @@
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95 + */ + +#ifdef KTRACE + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/ktrace.h> +#include <sys/malloc.h> +#include <sys/syslog.h> + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +struct ktr_header * +ktrgetheader(type) + int type; +{ + register struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), + M_TEMP, M_WAITOK); + kth->ktr_type = type; + microtime(&kth->ktr_time); + kth->ktr_pid = p->p_pid; + bcopy(p->p_comm, kth->ktr_comm, MAXCOMLEN); + return (kth); +} + +void +ktrsyscall(vp, code, argsize, args) + struct vnode *vp; + int code, argsize; + register_t args[]; +{ + struct ktr_header *kth; + struct ktr_syscall *ktp; + register len = sizeof(struct ktr_syscall) + argsize; + struct proc *p = curproc; /* XXX */ + register_t *argp; + int i; + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSCALL); + MALLOC(ktp, struct ktr_syscall *, len, M_TEMP, M_WAITOK); + ktp->ktr_code = code; + ktp->ktr_argsize = argsize; + argp = (register_t *)((char *)ktp + sizeof(struct ktr_syscall)); + for (i = 0; i < (argsize / sizeof *argp); i++) + *argp++ = args[i]; + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = len; + ktrwrite(vp, kth); + FREE(ktp, M_TEMP); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrsysret(vp, code, error, retval) + struct vnode *vp; + int code, error, retval; +{ + struct ktr_header *kth; + struct ktr_sysret ktp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSRET); + ktp.ktr_code = code; + ktp.ktr_error = error; + ktp.ktr_retval = retval; /* what about val2 ? 
*/ + + kth->ktr_buf = (caddr_t)&ktp; + kth->ktr_len = sizeof(struct ktr_sysret); + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrnamei(vp, path) + struct vnode *vp; + char *path; +{ + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_NAMEI); + kth->ktr_len = strlen(path); + kth->ktr_buf = path; + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrgenio(vp, fd, rw, iov, len, error) + struct vnode *vp; + int fd; + enum uio_rw rw; + register struct iovec *iov; + int len, error; +{ + struct ktr_header *kth; + register struct ktr_genio *ktp; + register caddr_t cp; + register int resid = len, cnt; + struct proc *p = curproc; /* XXX */ + + if (error) + return; + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_GENIO); + MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len, + M_TEMP, M_WAITOK); + ktp->ktr_fd = fd; + ktp->ktr_rw = rw; + cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio)); + while (resid > 0) { + if ((cnt = iov->iov_len) > resid) + cnt = resid; + if (copyin(iov->iov_base, cp, (unsigned)cnt)) + goto done; + cp += cnt; + resid -= cnt; + iov++; + } + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = sizeof (struct ktr_genio) + len; + + ktrwrite(vp, kth); +done: + FREE(kth, M_TEMP); + FREE(ktp, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrpsig(vp, sig, action, mask, code) + struct vnode *vp; + int sig; + sig_t action; + int mask, code; +{ + struct ktr_header *kth; + struct ktr_psig kp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_PSIG); + kp.signo = (char)sig; + kp.action = action; + kp.mask = mask; + kp.code = code; + kth->ktr_buf = (caddr_t)&kp; + kth->ktr_len = sizeof (struct ktr_psig); + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrcsw(vp, out, user) + struct 
vnode *vp; + int out, user; +{ + struct ktr_header *kth; + struct ktr_csw kc; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_CSW); + kc.out = out; + kc.user = user; + kth->ktr_buf = (caddr_t)&kc; + kth->ktr_len = sizeof (struct ktr_csw); + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +/* Interface and common routines */ + +/* + * ktrace system call + */ +/* ARGSUSED */ +int +ktrace(curp, uap, retval) + struct proc *curp; + register struct ktrace_args /* { + syscallarg(char *) fname; + syscallarg(int) ops; + syscallarg(int) facs; + syscallarg(int) pid; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp = NULL; + register struct proc *p; + struct pgrp *pg; + int facs = SCARG(uap, facs) & ~KTRFAC_ROOT; + int ops = KTROP(SCARG(uap, ops)); + int descend = SCARG(uap, ops) & KTRFLAG_DESCEND; + int ret = 0; + int error = 0; + struct nameidata nd; + + curp->p_traceflag |= KTRFAC_ACTIVE; + if (ops != KTROP_CLEAR) { + /* + * an operation which requires a file argument. + */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, fname), + curp); + if (error = vn_open(&nd, FREAD|FWRITE, 0)) { + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); + } + vp = nd.ni_vp; + VOP_UNLOCK(vp, 0, p); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (EACCES); + } + } + /* + * Clear all uses of the tracefile + */ + if (ops == KTROP_CLEARFILE) { + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_tracep == vp) { + if (ktrcanset(curp, p)) { + p->p_tracep = NULL; + p->p_traceflag = 0; + (void) vn_close(vp, FREAD|FWRITE, + p->p_ucred, p); + } else + error = EPERM; + } + } + goto done; + } + /* + * need something to (un)trace (XXX - why is this here?) 
+ */ + if (!facs) { + error = EINVAL; + goto done; + } + /* + * do it + */ + if (SCARG(uap, pid) < 0) { + /* + * by process group + */ + pg = pgfind(-SCARG(uap, pid)); + if (pg == NULL) { + error = ESRCH; + goto done; + } + for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + + } else { + /* + * by pid + */ + p = pfind(SCARG(uap, pid)); + if (p == NULL) { + error = ESRCH; + goto done; + } + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + } + if (!ret) + error = EPERM; +done: + if (vp != NULL) + (void) vn_close(vp, FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); +} + +int +ktrops(curp, p, ops, facs, vp) + struct proc *p, *curp; + int ops, facs; + struct vnode *vp; +{ + + if (!ktrcanset(curp, p)) + return (0); + if (ops == KTROP_SET) { + if (p->p_tracep != vp) { + /* + * if trace file already in use, relinquish + */ + if (p->p_tracep != NULL) + vrele(p->p_tracep); + VREF(vp); + p->p_tracep = vp; + } + p->p_traceflag |= facs; + if (curp->p_ucred->cr_uid == 0) + p->p_traceflag |= KTRFAC_ROOT; + } else { + /* KTROP_CLEAR */ + if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { + /* no more tracing */ + p->p_traceflag = 0; + if (p->p_tracep != NULL) { + vrele(p->p_tracep); + p->p_tracep = NULL; + } + } + } + + return (1); +} + +ktrsetchildren(curp, top, ops, facs, vp) + struct proc *curp, *top; + int ops, facs; + struct vnode *vp; +{ + register struct proc *p; + register int ret = 0; + + p = top; + for (;;) { + ret |= ktrops(curp, p, ops, facs, vp); + /* + * If this process has children, descend to them next, + * otherwise do any siblings, and if done with this level, + * follow back up the tree (but not past top). 
+ */ + if (p->p_children.lh_first) + p = p->p_children.lh_first; + else for (;;) { + if (p == top) + return (ret); + if (p->p_sibling.le_next) { + p = p->p_sibling.le_next; + break; + } + p = p->p_pptr; + } + } + /*NOTREACHED*/ +} + +ktrwrite(vp, kth) + struct vnode *vp; + register struct ktr_header *kth; +{ + struct uio auio; + struct iovec aiov[2]; + register struct proc *p = curproc; /* XXX */ + int error; + + if (vp == NULL) + return; + auio.uio_iov = &aiov[0]; + auio.uio_offset = 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + aiov[0].iov_base = (caddr_t)kth; + aiov[0].iov_len = sizeof(struct ktr_header); + auio.uio_resid = sizeof(struct ktr_header); + auio.uio_iovcnt = 1; + auio.uio_procp = (struct proc *)0; + if (kth->ktr_len > 0) { + auio.uio_iovcnt++; + aiov[1].iov_base = kth->ktr_buf; + aiov[1].iov_len = kth->ktr_len; + auio.uio_resid += kth->ktr_len; + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_WRITE(vp, &auio, IO_UNIT|IO_APPEND, p->p_ucred); + VOP_UNLOCK(vp, 0, p); + if (!error) + return; + /* + * If error encountered, give up tracing on this vnode. + */ + log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", + error); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_tracep == vp) { + p->p_tracep = NULL; + p->p_traceflag = 0; + vrele(vp); + } + } +} + +/* + * Return true if caller has permission to set the ktracing state + * of target. Essentially, the target can't possess any + * more permissions than the caller. KTRFAC_ROOT signifies that + * root previously set the tracing status on the target process, and + * so, only root may further change it. + * + * TODO: check groups. use caller effective gid. 
+ */ +ktrcanset(callp, targetp) + struct proc *callp, *targetp; +{ + register struct pcred *caller = callp->p_cred; + register struct pcred *target = targetp->p_cred; + + if ((caller->pc_ucred->cr_uid == target->p_ruid && + target->p_ruid == target->p_svuid && + caller->p_rgid == target->p_rgid && /* XXX */ + target->p_rgid == target->p_svgid && + (targetp->p_traceflag & KTRFAC_ROOT) == 0) || + caller->pc_ucred->cr_uid == 0) + return (1); + + return (0); +} + +#endif diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c new file mode 100644 index 000000000000..363cde5d682f --- /dev/null +++ b/sys/kern/kern_malloc.c @@ -0,0 +1,396 @@ +/* + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_malloc.c	8.4 (Berkeley) 5/20/95
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/map.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>

/* Power-of-two freelist buckets; bucket[i] holds blocks of 1 << i bytes. */
struct kmembuckets bucket[MINBUCKET + 16];
/* Per-type allocation statistics, indexed by the M_* type codes. */
struct kmemstats kmemstats[M_LAST];
/* Per-page usage records for the kmem arena (see btokup()). */
struct kmemusage *kmemusage;
char *kmembase, *kmemlimit;
char *memname[] = INITKMEMNAMES;

#ifdef DIAGNOSTIC
/*
 * This structure provides a set of masks to catch unaligned frees.
 */
long addrmask[] = { 0,
	0x00000001, 0x00000003, 0x00000007, 0x0000000f,
	0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
	0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
	0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
};

/*
 * The WEIRD_ADDR is used as known text to copy into free objects so
 * that modifications after frees can be detected.
 */
#define WEIRD_ADDR	0xdeadbeef
#define MAX_COPY	32

/*
 * Normally the first word of the structure is used to hold the list
 * pointer for free objects. However, when running with diagnostics,
 * we use the third and fourth fields, so as to catch modifications
 * in the most commonly trashed first two words.
 */
struct freelist {
	long	spare0;
	short	type;
	long	spare1;
	caddr_t	next;
};
#else /* !DIAGNOSTIC */
struct freelist {
	caddr_t	next;
};
#endif /* DIAGNOSTIC */

/*
 * Allocate a block of memory
 *
 * size  - requested size in bytes; rounded up to a power-of-two bucket
 *         (or to pages if larger than MAXALLOCSAVE)
 * type  - M_* allocation type for accounting
 * flags - M_WAITOK to sleep for memory, M_NOWAIT to fail with NULL
 *
 * Returns a pointer to the block, or NULL only when M_NOWAIT is set
 * and the request cannot be satisfied immediately.  Runs at splimp()
 * to exclude interrupt-level allocators.
 */
void *
malloc(size, type, flags)
	unsigned long size;
	int type, flags;
{
	register struct kmembuckets *kbp;
	register struct kmemusage *kup;
	register struct freelist *freep;
	long indx, npg, allocsize;
	int s;
	caddr_t va, cp, savedlist;
#ifdef DIAGNOSTIC
	long *end, *lp;
	int copysize;
	char *savedtype;
#endif
#ifdef DEBUG
	extern int simplelockrecurse;
#endif
#ifdef KMEMSTATS
	register struct kmemstats *ksp = &kmemstats[type];

	if (((unsigned long)type) > M_LAST)
		panic("malloc - bogus type");
#endif
	indx = BUCKETINDX(size);
	kbp = &bucket[indx];
	s = splimp();
#ifdef KMEMSTATS
	/* Block (or fail, for M_NOWAIT) while this type is over its limit. */
	while (ksp->ks_memuse >= ksp->ks_limit) {
		if (flags & M_NOWAIT) {
			splx(s);
			return ((void *) NULL);
		}
		if (ksp->ks_limblocks < 65535)
			ksp->ks_limblocks++;
		tsleep((caddr_t)ksp, PSWP+2, memname[type], 0);
	}
	ksp->ks_size |= 1 << indx;
#endif
#ifdef DIAGNOSTIC
	copysize = 1 << indx < MAX_COPY ? 1 << indx : MAX_COPY;
#endif
#ifdef DEBUG
	if (flags & M_NOWAIT)
		simplelockrecurse++;
#endif
	/* Bucket empty: grab fresh pages from the kmem arena and carve them. */
	if (kbp->kb_next == NULL) {
		kbp->kb_last = NULL;
		if (size > MAXALLOCSAVE)
			allocsize = roundup(size, CLBYTES);
		else
			allocsize = 1 << indx;
		npg = clrnd(btoc(allocsize));
		va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg),
					   !(flags & M_NOWAIT));
		if (va == NULL) {
			splx(s);
#ifdef DEBUG
			if (flags & M_NOWAIT)
				simplelockrecurse--;
#endif
			return ((void *) NULL);
		}
#ifdef KMEMSTATS
		kbp->kb_total += kbp->kb_elmpercl;
#endif
		kup = btokup(va);
		kup->ku_indx = indx;
		/* Oversize requests are tracked by page count, not freelist. */
		if (allocsize > MAXALLOCSAVE) {
			if (npg > 65535)
				panic("malloc: allocation too large");
			kup->ku_pagecnt = npg;
#ifdef KMEMSTATS
			ksp->ks_memuse += allocsize;
#endif
			goto out;
		}
#ifdef KMEMSTATS
		kup->ku_freecnt = kbp->kb_elmpercl;
		kbp->kb_totalfree += kbp->kb_elmpercl;
#endif
		/*
		 * Just in case we blocked while allocating memory,
		 * and someone else also allocated memory for this
		 * bucket, don't assume the list is still empty.
		 */
		savedlist = kbp->kb_next;
		/* Thread the new chunk onto the freelist from high to low. */
		kbp->kb_next = cp = va + (npg * NBPG) - allocsize;
		for (;;) {
			freep = (struct freelist *)cp;
#ifdef DIAGNOSTIC
			/*
			 * Copy in known text to detect modification
			 * after freeing.
			 */
			end = (long *)&cp[copysize];
			for (lp = (long *)cp; lp < end; lp++)
				*lp = WEIRD_ADDR;
			freep->type = M_FREE;
#endif /* DIAGNOSTIC */
			if (cp <= va)
				break;
			cp -= allocsize;
			freep->next = cp;
		}
		freep->next = savedlist;
		if (kbp->kb_last == NULL)
			kbp->kb_last = (caddr_t)freep;
	}
	/* Pop the first free block off the bucket's list. */
	va = kbp->kb_next;
	kbp->kb_next = ((struct freelist *)va)->next;
#ifdef DIAGNOSTIC
	freep = (struct freelist *)va;
	savedtype = (unsigned)freep->type < M_LAST ?
		memname[freep->type] : "???";
	/* Validate the new list head; a garbage pointer means the free
	 * block's link word was scribbled on after it was freed. */
	if (kbp->kb_next &&
	    !kernacc(kbp->kb_next, sizeof(struct freelist), 0)) {
		printf("%s of object 0x%x size %d %s %s (invalid addr 0x%x)\n",
			"Data modified on freelist: word 2.5", va, size,
			"previous type", savedtype, kbp->kb_next);
		kbp->kb_next = NULL;
	}
#if BYTE_ORDER == BIG_ENDIAN
	freep->type = WEIRD_ADDR >> 16;
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
	freep->type = (short)WEIRD_ADDR;
#endif
	if (((long)(&freep->next)) & 0x2)
		freep->next = (caddr_t)((WEIRD_ADDR >> 16)|(WEIRD_ADDR << 16));
	else
		freep->next = (caddr_t)WEIRD_ADDR;
	/* Verify the poison pattern survived while the block was free. */
	end = (long *)&va[copysize];
	for (lp = (long *)va; lp < end; lp++) {
		if (*lp == WEIRD_ADDR)
			continue;
		printf("%s %d of object 0x%x size %d %s %s (0x%x != 0x%x)\n",
			"Data modified on freelist: word", lp - (long *)va,
			va, size, "previous type", savedtype, *lp, WEIRD_ADDR);
		break;
	}
	freep->spare0 = 0;
#endif /* DIAGNOSTIC */
#ifdef KMEMSTATS
	kup = btokup(va);
	if (kup->ku_indx != indx)
		panic("malloc: wrong bucket");
	if (kup->ku_freecnt == 0)
		panic("malloc: lost data");
	kup->ku_freecnt--;
	kbp->kb_totalfree--;
	ksp->ks_memuse += 1 << indx;
out:
	kbp->kb_calls++;
	ksp->ks_inuse++;
	ksp->ks_calls++;
	if (ksp->ks_memuse > ksp->ks_maxused)
		ksp->ks_maxused = ksp->ks_memuse;
#else
out:
#endif
	splx(s);
#ifdef DEBUG
	if (flags & M_NOWAIT)
		simplelockrecurse--;
#endif
	return ((void *) va);
}

/*
 * Free a block of memory allocated by malloc.
+ */ +void +free(addr, type) + void *addr; + int type; +{ + register struct kmembuckets *kbp; + register struct kmemusage *kup; + register struct freelist *freep; + long size; + int s; +#ifdef DIAGNOSTIC + caddr_t cp; + long *end, *lp, alloc, copysize; +#endif +#ifdef KMEMSTATS + register struct kmemstats *ksp = &kmemstats[type]; +#endif + + kup = btokup(addr); + size = 1 << kup->ku_indx; + kbp = &bucket[kup->ku_indx]; + s = splimp(); +#ifdef DIAGNOSTIC + /* + * Check for returns of data that do not point to the + * beginning of the allocation. + */ + if (size > NBPG * CLSIZE) + alloc = addrmask[BUCKETINDX(NBPG * CLSIZE)]; + else + alloc = addrmask[kup->ku_indx]; + if (((u_long)addr & alloc) != 0) + panic("free: unaligned addr 0x%x, size %d, type %s, mask %d\n", + addr, size, memname[type], alloc); +#endif /* DIAGNOSTIC */ + if (size > MAXALLOCSAVE) { + kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt)); +#ifdef KMEMSTATS + size = kup->ku_pagecnt << PGSHIFT; + ksp->ks_memuse -= size; + kup->ku_indx = 0; + kup->ku_pagecnt = 0; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; + kbp->kb_total -= 1; +#endif + splx(s); + return; + } + freep = (struct freelist *)addr; +#ifdef DIAGNOSTIC + /* + * Check for multiple frees. Use a quick check to see if + * it looks free before laboriously searching the freelist. + */ + if (freep->spare0 == WEIRD_ADDR) { + for (cp = kbp->kb_next; cp; cp = *(caddr_t *)cp) { + if (addr != cp) + continue; + printf("multiply freed item 0x%x\n", addr); + panic("free: duplicated free"); + } + } + /* + * Copy in known text to detect modification after freeing + * and to make it look free. Also, save the type being freed + * so we can list likely culprit if modification is detected + * when the object is reallocated. + */ + copysize = size < MAX_COPY ? 
size : MAX_COPY; + end = (long *)&((caddr_t)addr)[copysize]; + for (lp = (long *)addr; lp < end; lp++) + *lp = WEIRD_ADDR; + freep->type = type; +#endif /* DIAGNOSTIC */ +#ifdef KMEMSTATS + kup->ku_freecnt++; + if (kup->ku_freecnt >= kbp->kb_elmpercl) + if (kup->ku_freecnt > kbp->kb_elmpercl) + panic("free: multiple frees"); + else if (kbp->kb_totalfree > kbp->kb_highwat) + kbp->kb_couldfree++; + kbp->kb_totalfree++; + ksp->ks_memuse -= size; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; +#endif + if (kbp->kb_next == NULL) + kbp->kb_next = addr; + else + ((struct freelist *)kbp->kb_last)->next = addr; + freep->next = NULL; + kbp->kb_last = addr; + splx(s); +} + +/* + * Initialize the kernel memory allocator + */ +kmeminit() +{ + register long indx; + int npg; + +#if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0) + ERROR!_kmeminit:_MAXALLOCSAVE_not_power_of_2 +#endif +#if (MAXALLOCSAVE > MINALLOCSIZE * 32768) + ERROR!_kmeminit:_MAXALLOCSAVE_too_big +#endif +#if (MAXALLOCSAVE < CLBYTES) + ERROR!_kmeminit:_MAXALLOCSAVE_too_small +#endif + npg = VM_KMEM_SIZE/ NBPG; + kmemusage = (struct kmemusage *) kmem_alloc(kernel_map, + (vm_size_t)(npg * sizeof(struct kmemusage))); + kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * NBPG), FALSE); +#ifdef KMEMSTATS + for (indx = 0; indx < MINBUCKET + 16; indx++) { + if (1 << indx >= CLBYTES) + bucket[indx].kb_elmpercl = 1; + else + bucket[indx].kb_elmpercl = CLBYTES / (1 << indx); + bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl; + } + for (indx = 0; indx < M_LAST; indx++) + kmemstats[indx].ks_limit = npg * NBPG * 6 / 10; +#endif +} diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c new file mode 100644 index 000000000000..67017933bc1d --- /dev/null +++ b/sys/kern/kern_proc.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of 
California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/map.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/acct.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <ufs/ufs/quota.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/ioctl.h>
#include <sys/tty.h>

/*
 * Structure associated with user cacheing.
 * One record per uid that currently owns at least one process;
 * records are created and destroyed on demand by chgproccnt().
 */
struct uidinfo {
	LIST_ENTRY(uidinfo) ui_hash;
	uid_t	ui_uid;
	long	ui_proccnt;	/* number of processes owned by ui_uid */
};
#define	UIHASH(uid)	(&uihashtbl[(uid) & uihash])
LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
u_long uihash;		/* size of hash table - 1 */

/*
 * Other process lists
 */
struct pidhashhead *pidhashtbl;
u_long pidhash;
struct pgrphashhead *pgrphashtbl;
u_long pgrphash;
struct proclist allproc;	/* all live processes */
struct proclist zombproc;	/* processes awaiting wait() collection */

/*
 * Initialize global process hashing structures.
 * Hash table sizes are scaled from maxproc; hashinit() stores the
 * mask (size - 1) through its third argument.
 */
void
procinit()
{

	LIST_INIT(&allproc);
	LIST_INIT(&zombproc);
	pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
	pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
	uihashtbl = hashinit(maxproc / 16, M_PROC, &uihash);
}

/*
 * Change the count associated with number of processes
 * a given user is using.  Returns the new count.  The uidinfo
 * record is allocated on first use and freed when the count
 * drops back to zero; going negative is a fatal accounting bug.
 */
int
chgproccnt(uid, diff)
	uid_t	uid;
	int	diff;
{
	register struct uidinfo *uip;
	register struct uihashhead *uipp;

	uipp = UIHASH(uid);
	for (uip = uipp->lh_first; uip != 0; uip = uip->ui_hash.le_next)
		if (uip->ui_uid == uid)
			break;
	if (uip) {
		uip->ui_proccnt += diff;
		if (uip->ui_proccnt > 0)
			return (uip->ui_proccnt);
		if (uip->ui_proccnt < 0)
			panic("chgproccnt: procs < 0");
		/* Count reached zero: discard the record. */
		LIST_REMOVE(uip, ui_hash);
		FREE(uip, M_PROC);
		return (0);
	}
	/* No record: only a positive delta may create one. */
	if (diff <= 0) {
		if (diff == 0)
			return(0);
		panic("chgproccnt: lost user");
	}
	MALLOC(uip, struct uidinfo *, sizeof(*uip), M_PROC, M_WAITOK);
	LIST_INSERT_HEAD(uipp, uip, ui_hash);
	uip->ui_uid = uid;
	uip->ui_proccnt = diff;
	return (diff);
}

/*
 * Is p an inferior of the current process?
 * Walks the parent chain from p; returns 1 if curproc is reached,
 * 0 if the chain terminates at pid 0 first.
 */
inferior(p)
	register struct proc *p;
{

	for (; p != curproc; p = p->p_pptr)
		if (p->p_pid == 0)
			return (0);
	return (1);
}

/*
 * Locate a process by number.
 * Returns NULL if no live process has that pid.
 */
struct proc *
pfind(pid)
	register pid_t pid;
{
	register struct proc *p;

	for (p = PIDHASH(pid)->lh_first; p != 0; p = p->p_hash.le_next)
		if (p->p_pid == pid)
			return (p);
	return (NULL);
}

/*
 * Locate a process group by number.
 * Returns NULL if no group has that id.
 */
struct pgrp *
pgfind(pgid)
	register pid_t pgid;
{
	register struct pgrp *pgrp;

	for (pgrp = PGRPHASH(pgid)->lh_first; pgrp != 0;
	    pgrp = pgrp->pg_hash.le_next)
		if (pgrp->pg_id == pgid)
			return (pgrp);
	return (NULL);
}

/*
 * Move p to a new or existing process group (and session).
 * If no group pgid exists it is created (p must then be its leader,
 * pgid == p->p_pid); with mksess a new session is created as well.
 * Returns 0 on success or an errno (ESRCH if p vanished while the
 * pgrp allocation could have slept).
 */
int
enterpgrp(p, pgid, mksess)
	register struct proc *p;
	pid_t pgid;
	int mksess;
{
	register struct pgrp *pgrp = pgfind(pgid);

#ifdef DIAGNOSTIC
	if (pgrp != NULL && mksess)	/* firewalls */
		panic("enterpgrp: setsid into non-empty pgrp");
	if (SESS_LEADER(p))
		panic("enterpgrp: session leader attempted setpgrp");
#endif
	if (pgrp == NULL) {
		pid_t savepid = p->p_pid;
		struct proc *np;
		/*
		 * new process group
		 */
#ifdef DIAGNOSTIC
		if (p->p_pid != pgid)
			panic("enterpgrp: new pgrp and pid != pgid");
#endif
		MALLOC(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
		    M_WAITOK);
		/*
		 * The M_WAITOK allocation may have blocked; re-look up the
		 * pid to make sure p still exists (presumably it could have
		 * exited meanwhile — NOTE(review): confirm against callers).
		 */
		if ((np = pfind(savepid)) == NULL || np != p)
			return (ESRCH);
		if (mksess) {
			register struct session *sess;

			/*
			 * new session
			 */
			MALLOC(sess, struct session *, sizeof(struct session),
			    M_SESSION, M_WAITOK);
			sess->s_leader = p;
			sess->s_count = 1;
			sess->s_ttyvp = NULL;
			sess->s_ttyp = NULL;
			/* Inherit the login name from the old session. */
			bcopy(p->p_session->s_login, sess->s_login,
			    sizeof(sess->s_login));
			/* A new session starts with no controlling tty. */
			p->p_flag &= ~P_CONTROLT;
			pgrp->pg_session = sess;
#ifdef DIAGNOSTIC
			if (p != curproc)
				panic("enterpgrp: mksession and p != curproc");
#endif
		} else {
			pgrp->pg_session = p->p_session;
			pgrp->pg_session->s_count++;
		}
		pgrp->pg_id = pgid;
		LIST_INIT(&pgrp->pg_members);
		LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
		pgrp->pg_jobc = 0;
	} else if (pgrp == p->p_pgrp)
		return (0);

	/*
	 * Adjust eligibility of affected pgrps to participate in job control.
	 * Increment eligibility counts before decrementing, otherwise we
	 * could reach 0 spuriously during the first call.
	 */
	fixjobc(p, pgrp, 1);
	fixjobc(p, p->p_pgrp, 0);

	LIST_REMOVE(p, p_pglist);
	if (p->p_pgrp->pg_members.lh_first == 0)
		pgdelete(p->p_pgrp);
	p->p_pgrp = pgrp;
	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
	return (0);
}

/*
 * remove process from process group; deletes the group if p was
 * its last member.
 */
int
leavepgrp(p)
	register struct proc *p;
{

	LIST_REMOVE(p, p_pglist);
	if (p->p_pgrp->pg_members.lh_first == 0)
		pgdelete(p->p_pgrp);
	p->p_pgrp = 0;
	return (0);
}

/*
 * delete a process group: detach it from its controlling tty,
 * unhash it, and drop the session reference (freeing the session
 * when this was its last group).
 */
void
pgdelete(pgrp)
	register struct pgrp *pgrp;
{

	if (pgrp->pg_session->s_ttyp != NULL &&
	    pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
		pgrp->pg_session->s_ttyp->t_pgrp = NULL;
	LIST_REMOVE(pgrp, pg_hash);
	if (--pgrp->pg_session->s_count == 0)
		FREE(pgrp->pg_session, M_SESSION);
	FREE(pgrp, M_PGRP);
}

static void orphanpg();

/*
 * Adjust pgrp jobc counters when specified process changes process group.
 * We count the number of processes in each process group that "qualify"
 * the group for terminal job control (those with a parent in a different
 * process group of the same session).  If that count reaches zero, the
 * process group becomes orphaned.  Check both the specified process'
 * process group and that of its children.
 * entering == 0 => p is leaving specified group.
 * entering == 1 => p is entering specified group.
 */
void
fixjobc(p, pgrp, entering)
	register struct proc *p;
	register struct pgrp *pgrp;
	int entering;
{
	register struct pgrp *hispgrp;
	register struct session *mysession = pgrp->pg_session;

	/*
	 * Check p's parent to see whether p qualifies its own process
	 * group; if so, adjust count for p's process group.
	 */
	if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
	    hispgrp->pg_session == mysession)
		if (entering)
			pgrp->pg_jobc++;
		else if (--pgrp->pg_jobc == 0)
			orphanpg(pgrp);

	/*
	 * Check this process' children to see whether they qualify
	 * their process groups; if so, adjust counts for children's
	 * process groups.  (Zombies no longer qualify anything.)
	 */
	for (p = p->p_children.lh_first; p != 0; p = p->p_sibling.le_next)
		if ((hispgrp = p->p_pgrp) != pgrp &&
		    hispgrp->pg_session == mysession &&
		    p->p_stat != SZOMB)
			if (entering)
				hispgrp->pg_jobc++;
			else if (--hispgrp->pg_jobc == 0)
				orphanpg(hispgrp);
}

/*
 * A process group has become orphaned;
 * if there are any stopped processes in the group,
 * hang-up all process in that group.
 * (The outer loop's variable is reused by the inner loop; that is
 * safe because the function returns immediately afterwards.)
 */
static void
orphanpg(pg)
	struct pgrp *pg;
{
	register struct proc *p;

	for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) {
		if (p->p_stat == SSTOP) {
			for (p = pg->pg_members.lh_first; p != 0;
			    p = p->p_pglist.le_next) {
				psignal(p, SIGHUP);
				psignal(p, SIGCONT);
			}
			return;
		}
	}
}

#ifdef DEBUG
/*
 * Dump the process-group hash table to the console (debugging aid).
 */
pgrpdump()
{
	register struct pgrp *pgrp;
	register struct proc *p;
	register i;

	for (i = 0; i <= pgrphash; i++) {
		if (pgrp = pgrphashtbl[i].lh_first) {
			printf("\tindx %d\n", i);
			for (; pgrp != 0; pgrp = pgrp->pg_hash.le_next) {
				printf("\tpgrp %x, pgid %d, sess %x, sesscnt %d, mem %x\n",
				    pgrp, pgrp->pg_id, pgrp->pg_session,
				    pgrp->pg_session->s_count,
				    pgrp->pg_members.lh_first);
				for (p = pgrp->pg_members.lh_first; p != 0;
				    p = p->p_pglist.le_next) {
					printf("\t\tpid %d addr %x pgrp %x\n",
					    p->p_pid, p, p->p_pgrp);
				}
			}
		}
	}
}
#endif /* DEBUG */
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
new file mode 100644
index 000000000000..29e4c679c6f2
--- /dev/null
+++ b/sys/kern/kern_prot.c
@@ -0,0 +1,601 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_prot.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls related to processes and protection
 */

#include <sys/param.h>
#include <sys/acct.h>
#include <sys/systm.h>
#include <sys/ucred.h>
#include <sys/proc.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/malloc.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

/*
 * Return the caller's pid; under 4.3BSD/SunOS compatibility the parent
 * pid is additionally returned in the second return register.
 */
/* ARGSUSED */
int
getpid(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = p->p_pid;
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	retval[1] = p->p_pptr->p_pid;
#endif
	return (0);
}

/* Return the parent's pid. */
/* ARGSUSED */
int
getppid(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = p->p_pptr->p_pid;
	return (0);
}

/* Get process group ID; note that POSIX getpgrp takes no parameter */
int
getpgrp(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = p->p_pgrp->pg_id;
	return (0);
}

/*
 * Return the real uid; under compatibility the effective uid is also
 * returned in the second return register.
 */
/* ARGSUSED */
int
getuid(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = p->p_cred->p_ruid;
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	retval[1] = p->p_ucred->cr_uid;
#endif
	return (0);
}

/* Return the effective uid. */
/* ARGSUSED */
int
geteuid(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = p->p_ucred->cr_uid;
	return (0);
}

/*
 * Return the real gid; under compatibility the effective gid
 * (cr_groups[0]) is also returned in the second return register.
 */
/* ARGSUSED */
int
getgid(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = p->p_cred->p_rgid;
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	retval[1] = p->p_ucred->cr_groups[0];
#endif
	return (0);
}

/*
 * Get effective group ID.  The "egid" is groups[0], and could be obtained
 * via getgroups.  This syscall exists because it is somewhat painful to do
 * correctly in a library function.
 */
/* ARGSUSED */
int
getegid(p, uap, retval)
	struct proc *p;
	void *uap;
	register_t *retval;
{

	*retval = p->p_ucred->cr_groups[0];
	return (0);
}

/*
 * Copy out the supplementary group list.  A gidsetsize of 0 is a size
 * probe: only the count is returned.  EINVAL if the caller's buffer is
 * too small for the full list.
 */
int
getgroups(p, uap, retval)
	struct proc *p;
	register struct getgroups_args /* {
		syscallarg(u_int) gidsetsize;
		syscallarg(gid_t *) gidset;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	register u_int ngrp;
	int error;

	if ((ngrp = SCARG(uap, gidsetsize)) == 0) {
		*retval = pc->pc_ucred->cr_ngroups;
		return (0);
	}
	if (ngrp < pc->pc_ucred->cr_ngroups)
		return (EINVAL);
	ngrp = pc->pc_ucred->cr_ngroups;
	if (error = copyout((caddr_t)pc->pc_ucred->cr_groups,
	    (caddr_t)SCARG(uap, gidset), ngrp * sizeof(gid_t)))
		return (error);
	*retval = ngrp;
	return (0);
}

/*
 * Create a new session with the caller as leader of a new process
 * group whose id is the caller's pid.  Fails with EPERM if the caller
 * already leads a group or such a group exists.
 */
/* ARGSUSED */
int
setsid(p, uap, retval)
	register struct proc *p;
	void *uap;
	register_t *retval;
{

	if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) {
		return (EPERM);
	} else {
		(void)enterpgrp(p, p->p_pid, 1);
		*retval = p->p_pid;
		return (0);
	}
}

/*
 * set process group (setpgid/old setpgrp)
 *
 * caller does setpgid(targpid, targpgid)
 *
 * pid must be caller or child of caller (ESRCH)
 * if a child
 *	pid must be in same session (EPERM)
 *	pid can't have done an exec (EACCES)
 * if pgid != pid
 * 	there must exist some pid in same session having pgid (EPERM)
 * pid must not be session leader (EPERM)
 */
/* ARGSUSED */
int
setpgid(curp, uap, retval)
	struct proc *curp;
	register struct setpgid_args /* {
		syscallarg(int) pid;
		syscallarg(int) pgid;
	} */ *uap;
	register_t *retval;
{
	register struct proc *targp;		/* target process */
	register struct pgrp *pgrp;		/* target pgrp */

	if (SCARG(uap, pid) != 0 && SCARG(uap, pid) != curp->p_pid) {
		if ((targp = pfind(SCARG(uap, pid))) == 0 || !inferior(targp))
			return (ESRCH);
		if (targp->p_session != curp->p_session)
			return (EPERM);
		if (targp->p_flag & P_EXEC)
			return (EACCES);
	} else
		targp = curp;
	if (SESS_LEADER(targp))
		return (EPERM);
	if (SCARG(uap, pgid) == 0)
		/* pgid 0 means "use the target's own pid". */
		SCARG(uap, pgid) = targp->p_pid;
	else if (SCARG(uap, pgid) != targp->p_pid)
		if ((pgrp = pgfind(SCARG(uap, pgid))) == 0 ||
		    pgrp->pg_session != curp->p_session)
			return (EPERM);
	return (enterpgrp(targp, SCARG(uap, pgid), 0));
}

/*
 * Set real, effective and saved uids.  Non-superuser callers may only
 * set the uid to the current real uid.
 */
/* ARGSUSED */
int
setuid(p, uap, retval)
	struct proc *p;
	struct setuid_args /* {
		syscallarg(uid_t) uid;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	register uid_t uid;
	int error;

	uid = SCARG(uap, uid);
	if (uid != pc->p_ruid &&
	    (error = suser(pc->pc_ucred, &p->p_acflag)))
		return (error);
	/*
	 * Everything's okay, do it.
	 * Transfer proc count to new user.
	 * Copy credentials so other references do not see our changes.
	 */
	(void)chgproccnt(pc->p_ruid, -1);
	(void)chgproccnt(uid, 1);
	pc->pc_ucred = crcopy(pc->pc_ucred);
	pc->pc_ucred->cr_uid = uid;
	pc->p_ruid = uid;
	pc->p_svuid = uid;
	p->p_flag |= P_SUGID;
	return (0);
}

/*
 * Set the effective uid only.  Permitted targets without superuser
 * privilege are the real and saved uids.
 */
/* ARGSUSED */
int
seteuid(p, uap, retval)
	struct proc *p;
	struct seteuid_args /* {
		syscallarg(uid_t) euid;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	register uid_t euid;
	int error;

	euid = SCARG(uap, euid);
	if (euid != pc->p_ruid && euid != pc->p_svuid &&
	    (error = suser(pc->pc_ucred, &p->p_acflag)))
		return (error);
	/*
	 * Everything's okay, do it.  Copy credentials so other references do
	 * not see our changes.
	 */
	pc->pc_ucred = crcopy(pc->pc_ucred);
	pc->pc_ucred->cr_uid = euid;
	p->p_flag |= P_SUGID;
	return (0);
}

/*
 * Set real, effective and saved gids.  Non-superuser callers may only
 * set the gid to the current real gid.
 */
/* ARGSUSED */
int
setgid(p, uap, retval)
	struct proc *p;
	struct setgid_args /* {
		syscallarg(gid_t) gid;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	register gid_t gid;
	int error;

	gid = SCARG(uap, gid);
	if (gid != pc->p_rgid && (error = suser(pc->pc_ucred, &p->p_acflag)))
		return (error);
	pc->pc_ucred = crcopy(pc->pc_ucred);
	pc->pc_ucred->cr_groups[0] = gid;
	pc->p_rgid = gid;
	pc->p_svgid = gid;		/* ??? */
	p->p_flag |= P_SUGID;
	return (0);
}

/*
 * Set the effective gid (cr_groups[0]) only.  Permitted targets
 * without superuser privilege are the real and saved gids.
 */
/* ARGSUSED */
int
setegid(p, uap, retval)
	struct proc *p;
	struct setegid_args /* {
		syscallarg(gid_t) egid;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	register gid_t egid;
	int error;

	egid = SCARG(uap, egid);
	if (egid != pc->p_rgid && egid != pc->p_svgid &&
	    (error = suser(pc->pc_ucred, &p->p_acflag)))
		return (error);
	pc->pc_ucred = crcopy(pc->pc_ucred);
	pc->pc_ucred->cr_groups[0] = egid;
	p->p_flag |= P_SUGID;
	return (0);
}

/*
 * Replace the supplementary group list (superuser only).
 * Note that the effective gid lives in cr_groups[0], so it is
 * overwritten as well.
 */
/* ARGSUSED */
int
setgroups(p, uap, retval)
	struct proc *p;
	struct setgroups_args /* {
		syscallarg(u_int) gidsetsize;
		syscallarg(gid_t *) gidset;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	register u_int ngrp;
	int error;

	if (error = suser(pc->pc_ucred, &p->p_acflag))
		return (error);
	ngrp = SCARG(uap, gidsetsize);
	if (ngrp < 1 || ngrp > NGROUPS)
		return (EINVAL);
	pc->pc_ucred = crcopy(pc->pc_ucred);
	if (error = copyin((caddr_t)SCARG(uap, gidset),
	    (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t)))
		return (error);
	pc->pc_ucred->cr_ngroups = ngrp;
	p->p_flag |= P_SUGID;
	return (0);
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Old setreuid(2), emulated in terms of setuid/seteuid.
 */
/* ARGSUSED */
int
compat_43_setreuid(p, uap, retval)
	register struct proc *p;
	struct compat_43_setreuid_args /* {
		syscallarg(int) ruid;
		syscallarg(int) euid;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	union {
		struct setuid_args sa;
		struct seteuid_args ea;
	} args;

	/*
	 * If ruid == euid then setreuid is being used to emulate setuid,
	 * just do it.
	 */
	if (SCARG(uap, ruid) != -1 && SCARG(uap, ruid) == SCARG(uap, euid)) {
		SCARG(&args.sa, uid) = SCARG(uap, ruid);
		return (setuid(p, &args.sa, retval));
	}
	/*
	 * Otherwise we assume that the intent of setting ruid is to be
	 * able to get back ruid priviledge (i.e. swapping ruid and euid).
	 * So we make sure that we will be able to do so, but do not
	 * actually set the ruid.
	 */
	if (SCARG(uap, ruid) != (uid_t)-1 && SCARG(uap, ruid) != pc->p_ruid &&
	    SCARG(uap, ruid) != pc->p_svuid)
		return (EPERM);
	if (SCARG(uap, euid) == (uid_t)-1)
		return (0);
	SCARG(&args.ea, euid) = SCARG(uap, euid);
	return (seteuid(p, &args.ea, retval));
}

/*
 * Old setregid(2), emulated in terms of setgid/setegid.
 */
/* ARGSUSED */
int
compat_43_setregid(p, uap, retval)
	register struct proc *p;
	struct compat_43_setregid_args /* {
		syscallarg(int) rgid;
		syscallarg(int) egid;
	} */ *uap;
	register_t *retval;
{
	register struct pcred *pc = p->p_cred;
	union {
		struct setgid_args sa;
		struct setegid_args ea;
	} args;

	/*
	 * If rgid == egid then setreuid is being used to emulate setgid,
	 * just do it.
	 */
	if (SCARG(uap, rgid) != -1 && SCARG(uap, rgid) == SCARG(uap, egid)) {
		SCARG(&args.sa, gid) = SCARG(uap, rgid);
		return (setgid(p, &args.sa, retval));
	}
	/*
	 * Otherwise we assume that the intent of setting rgid is to be
	 * able to get back rgid priviledge (i.e. swapping rgid and egid).
	 * So we make sure that we will be able to do so, but do not
	 * actually set the rgid.
	 */
	if (SCARG(uap, rgid) != (gid_t)-1 && SCARG(uap, rgid) != pc->p_rgid &&
	    SCARG(uap, rgid) != pc->p_svgid)
		return (EPERM);
	if (SCARG(uap, egid) == (gid_t)-1)
		return (0);
	SCARG(&args.ea, egid) = SCARG(uap, egid);
	return (setegid(p, &args.ea, retval));
}
#endif /* defined(COMPAT_43) || defined(COMPAT_SUNOS) */

/*
 * Check if gid is a member of the group set.
 */
int
groupmember(gid, cred)
	gid_t gid;
	register struct ucred *cred;
{
	register gid_t *gp;
	gid_t *egp;

	egp = &(cred->cr_groups[cred->cr_ngroups]);
	for (gp = cred->cr_groups; gp < egp; gp++)
		if (*gp == gid)
			return (1);
	return (0);
}

/*
 * Test whether the specified credentials imply "super-user"
 * privilege; if so, and we have accounting info, set the flag
 * indicating use of super-powers.
 * Returns 0 or error.
 */
int
suser(cred, acflag)
	struct ucred *cred;
	u_short *acflag;
{
	if (cred->cr_uid == 0) {
		if (acflag)
			*acflag |= ASU;
		return (0);
	}
	return (EPERM);
}

/*
 * Allocate a zeroed cred structure.
 * The new credential starts with a reference count of 1.
 */
struct ucred *
crget()
{
	register struct ucred *cr;

	MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK);
	bzero((caddr_t)cr, sizeof(*cr));
	cr->cr_ref = 1;
	return (cr);
}

/*
 * Free a cred structure.
 * Throws away space when ref count gets to 0.
 */
void
crfree(cr)
	struct ucred *cr;
{
	int s;

	s = splimp();				/* ??? */
	if (--cr->cr_ref == 0)
		FREE((caddr_t)cr, M_CRED);
	(void) splx(s);
}

/*
 * Copy cred structure to a new one and free the old one.
 * If the credential is not shared (ref count 1) it is returned
 * unchanged, implementing copy-on-write for shared credentials.
 */
struct ucred *
crcopy(cr)
	struct ucred *cr;
{
	struct ucred *newcr;

	if (cr->cr_ref == 1)
		return (cr);
	newcr = crget();
	*newcr = *cr;
	crfree(cr);
	newcr->cr_ref = 1;
	return (newcr);
}

/*
 * Dup cred struct to a new held one.
 */
struct ucred *
crdup(cr)
	struct ucred *cr;
{
	struct ucred *newcr;

	newcr = crget();
	*newcr = *cr;
	newcr->cr_ref = 1;
	return (newcr);
}

/*
 * Get login name, if available.
 * The request length is silently clamped to the size of the
 * session's login buffer.
 */
/* ARGSUSED */
int
getlogin(p, uap, retval)
	struct proc *p;
	struct getlogin_args /* {
		syscallarg(char *) namebuf;
		syscallarg(u_int) namelen;
	} */ *uap;
	register_t *retval;
{

	if (SCARG(uap, namelen) > sizeof (p->p_pgrp->pg_session->s_login))
		SCARG(uap, namelen) = sizeof (p->p_pgrp->pg_session->s_login);
	return (copyout((caddr_t) p->p_pgrp->pg_session->s_login,
	    (caddr_t) SCARG(uap, namebuf), SCARG(uap, namelen)));
}

/*
 * Set login name (superuser only).  An over-long name yields EINVAL
 * rather than ENAMETOOLONG.
 */
/* ARGSUSED */
int
setlogin(p, uap, retval)
	struct proc *p;
	struct setlogin_args /* {
		syscallarg(char *) namebuf;
	} */ *uap;
	register_t *retval;
{
	int error;

	if (error = suser(p->p_ucred, &p->p_acflag))
		return (error);
	error = copyinstr((caddr_t) SCARG(uap, namebuf),
	    (caddr_t) p->p_pgrp->pg_session->s_login,
	    sizeof (p->p_pgrp->pg_session->s_login) - 1, (u_int *)0);
	if (error == ENAMETOOLONG)
		error = EINVAL;
	return (error);
}
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
new file mode 100644
index 000000000000..569b9d973a10
--- /dev/null
+++ b/sys/kern/kern_resource.c
@@ -0,0 +1,489 @@
/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_resource.c 8.8 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +#include <vm/vm.h> + +int donice __P((struct proc *curp, struct proc *chgp, int n)); +int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); + +/* + * Resource controls and accounting. + */ + +int +getpriority(curp, uap, retval) + struct proc *curp; + register struct getpriority_args /* { + syscallarg(int) which; + syscallarg(int) who; + } */ *uap; + register_t *retval; +{ + register struct proc *p; + register int low = PRIO_MAX + 1; + + switch (SCARG(uap, which)) { + + case PRIO_PROCESS: + if (SCARG(uap, who) == 0) + p = curp; + else + p = pfind(SCARG(uap, who)); + if (p == 0) + break; + low = p->p_nice; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (SCARG(uap, who) == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(SCARG(uap, who))) == NULL) + break; + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + if (p->p_nice < low) + low = p->p_nice; + } + break; + } + + case PRIO_USER: + if (SCARG(uap, who) == 0) + SCARG(uap, who) = curp->p_ucred->cr_uid; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_ucred->cr_uid == SCARG(uap, who) && + p->p_nice < low) + low = p->p_nice; + break; + + default: + return (EINVAL); + } + if (low == PRIO_MAX + 1) + return (ESRCH); + *retval = low; + return (0); +} + +/* ARGSUSED */ +int +setpriority(curp, uap, retval) + struct proc *curp; + register struct setpriority_args /* { + syscallarg(int) which; + syscallarg(int) who; + syscallarg(int) prio; + } */ *uap; + register_t *retval; +{ + register struct proc *p; + int found = 0, error = 0; + + switch (SCARG(uap, which)) { + + case PRIO_PROCESS: + if (SCARG(uap, who) == 0) + p = curp; + else + p = pfind(SCARG(uap, who)); + 
if (p == 0) + break; + error = donice(curp, p, SCARG(uap, prio)); + found++; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (SCARG(uap, who) == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(SCARG(uap, who))) == NULL) + break; + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + error = donice(curp, p, SCARG(uap, prio)); + found++; + } + break; + } + + case PRIO_USER: + if (SCARG(uap, who) == 0) + SCARG(uap, who) = curp->p_ucred->cr_uid; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_ucred->cr_uid == SCARG(uap, who)) { + error = donice(curp, p, SCARG(uap, prio)); + found++; + } + break; + + default: + return (EINVAL); + } + if (found == 0) + return (ESRCH); + return (error); +} + +int +donice(curp, chgp, n) + register struct proc *curp, *chgp; + register int n; +{ + register struct pcred *pcred = curp->p_cred; + + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != chgp->p_ucred->cr_uid && + pcred->p_ruid != chgp->p_ucred->cr_uid) + return (EPERM); + if (n > PRIO_MAX) + n = PRIO_MAX; + if (n < PRIO_MIN) + n = PRIO_MIN; + if (n < chgp->p_nice && suser(pcred->pc_ucred, &curp->p_acflag)) + return (EACCES); + chgp->p_nice = n; + (void)resetpriority(chgp); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* ARGSUSED */ +int +compat_43_setrlimit(p, uap, retval) + struct proc *p; + struct compat_43_setrlimit_args /* { + syscallarg(u_int) which; + syscallarg(struct ogetrlimit *) rlp; + } */ *uap; + register_t *retval; +{ + struct orlimit olim; + struct rlimit lim; + int error; + + if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&olim, + sizeof (struct orlimit))) + return (error); + lim.rlim_cur = olim.rlim_cur; + lim.rlim_max = olim.rlim_max; + return (dosetrlimit(p, SCARG(uap, which), &lim)); +} + +/* ARGSUSED */ +int +compat_43_getrlimit(p, uap, retval) + struct proc *p; + register struct compat_43_getrlimit_args /* { + syscallarg(u_int) which; + 
syscallarg(struct ogetrlimit *) rlp; + } */ *uap; + register_t *retval; +{ + struct orlimit olim; + + if (SCARG(uap, which) >= RLIM_NLIMITS) + return (EINVAL); + olim.rlim_cur = p->p_rlimit[SCARG(uap, which)].rlim_cur; + if (olim.rlim_cur == -1) + olim.rlim_cur = 0x7fffffff; + olim.rlim_max = p->p_rlimit[SCARG(uap, which)].rlim_max; + if (olim.rlim_max == -1) + olim.rlim_max = 0x7fffffff; + return (copyout((caddr_t)&olim, (caddr_t)SCARG(uap, rlp), + sizeof(olim))); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* ARGSUSED */ +int +setrlimit(p, uap, retval) + struct proc *p; + register struct setrlimit_args /* { + syscallarg(u_int) which; + syscallarg(struct rlimit *) rlp; + } */ *uap; + register_t *retval; +{ + struct rlimit alim; + int error; + + if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&alim, + sizeof (struct rlimit))) + return (error); + return (dosetrlimit(p, SCARG(uap, which), &alim)); +} + +int +dosetrlimit(p, which, limp) + struct proc *p; + u_int which; + struct rlimit *limp; +{ + register struct rlimit *alimp; + extern unsigned maxdmap; + int error; + + if (which >= RLIM_NLIMITS) + return (EINVAL); + alimp = &p->p_rlimit[which]; + if (limp->rlim_cur > alimp->rlim_max || + limp->rlim_max > alimp->rlim_max) + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + if (limp->rlim_cur > limp->rlim_max) + limp->rlim_cur = limp->rlim_max; + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + alimp = &p->p_rlimit[which]; + } + + switch (which) { + + case RLIMIT_DATA: + if (limp->rlim_cur > maxdmap) + limp->rlim_cur = maxdmap; + if (limp->rlim_max > maxdmap) + limp->rlim_max = maxdmap; + break; + + case RLIMIT_STACK: + if (limp->rlim_cur > maxdmap) + limp->rlim_cur = maxdmap; + if (limp->rlim_max > maxdmap) + limp->rlim_max = maxdmap; + /* + * Stack is allocated to the max at exec time with only + * "rlim_cur" bytes accessible. 
If stack limit is going + * up make more accessible, if going down make inaccessible. + */ + if (limp->rlim_cur != alimp->rlim_cur) { + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot; + + if (limp->rlim_cur > alimp->rlim_cur) { + prot = VM_PROT_ALL; + size = limp->rlim_cur - alimp->rlim_cur; + addr = USRSTACK - limp->rlim_cur; + } else { + prot = VM_PROT_NONE; + size = alimp->rlim_cur - limp->rlim_cur; + addr = USRSTACK - alimp->rlim_cur; + } + addr = trunc_page(addr); + size = round_page(size); + (void) vm_map_protect(&p->p_vmspace->vm_map, + addr, addr+size, prot, FALSE); + } + break; + + case RLIMIT_NOFILE: + if (limp->rlim_cur > maxfiles) + limp->rlim_cur = maxfiles; + if (limp->rlim_max > maxfiles) + limp->rlim_max = maxfiles; + break; + + case RLIMIT_NPROC: + if (limp->rlim_cur > maxproc) + limp->rlim_cur = maxproc; + if (limp->rlim_max > maxproc) + limp->rlim_max = maxproc; + break; + } + *alimp = *limp; + return (0); +} + +/* ARGSUSED */ +int +getrlimit(p, uap, retval) + struct proc *p; + register struct getrlimit_args /* { + syscallarg(u_int) which; + syscallarg(struct rlimit *) rlp; + } */ *uap; + register_t *retval; +{ + + if (SCARG(uap, which) >= RLIM_NLIMITS) + return (EINVAL); + return (copyout((caddr_t)&p->p_rlimit[SCARG(uap, which)], + (caddr_t)SCARG(uap, rlp), sizeof (struct rlimit))); +} + +/* + * Transform the running time and tick information in proc p into user, + * system, and interrupt time usage. 
+ */ +void +calcru(p, up, sp, ip) + register struct proc *p; + register struct timeval *up; + register struct timeval *sp; + register struct timeval *ip; +{ + register u_quad_t u, st, ut, it, tot; + register u_long sec, usec; + register int s; + struct timeval tv; + + s = splstatclock(); + st = p->p_sticks; + ut = p->p_uticks; + it = p->p_iticks; + splx(s); + + tot = st + ut + it; + if (tot == 0) { + up->tv_sec = up->tv_usec = 0; + sp->tv_sec = sp->tv_usec = 0; + if (ip != NULL) + ip->tv_sec = ip->tv_usec = 0; + return; + } + + sec = p->p_rtime.tv_sec; + usec = p->p_rtime.tv_usec; + if (p == curproc) { + /* + * Adjust for the current time slice. This is actually fairly + * important since the error here is on the order of a time + * quantum, which is much greater than the sampling error. + */ + microtime(&tv); + sec += tv.tv_sec - runtime.tv_sec; + usec += tv.tv_usec - runtime.tv_usec; + } + u = sec * 1000000 + usec; + st = (u * st) / tot; + sp->tv_sec = st / 1000000; + sp->tv_usec = st % 1000000; + ut = (u * ut) / tot; + up->tv_sec = ut / 1000000; + up->tv_usec = ut % 1000000; + if (ip != NULL) { + it = (u * it) / tot; + ip->tv_sec = it / 1000000; + ip->tv_usec = it % 1000000; + } +} + +/* ARGSUSED */ +int +getrusage(p, uap, retval) + register struct proc *p; + register struct getrusage_args /* { + syscallarg(int) who; + syscallarg(struct rusage *) rusage; + } */ *uap; + register_t *retval; +{ + register struct rusage *rup; + + switch (SCARG(uap, who)) { + + case RUSAGE_SELF: + rup = &p->p_stats->p_ru; + calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); + break; + + case RUSAGE_CHILDREN: + rup = &p->p_stats->p_cru; + break; + + default: + return (EINVAL); + } + return (copyout((caddr_t)rup, (caddr_t)SCARG(uap, rusage), + sizeof (struct rusage))); +} + +void +ruadd(ru, ru2) + register struct rusage *ru, *ru2; +{ + register long *ip, *ip2; + register int i; + + timevaladd(&ru->ru_utime, &ru2->ru_utime); + timevaladd(&ru->ru_stime, &ru2->ru_stime); + if (ru->ru_maxrss 
< ru2->ru_maxrss) + ru->ru_maxrss = ru2->ru_maxrss; + ip = &ru->ru_first; ip2 = &ru2->ru_first; + for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) + *ip++ += *ip2++; +} + +/* + * Make a copy of the plimit structure. + * We share these structures copy-on-write after fork, + * and copy when a limit is changed. + */ +struct plimit * +limcopy(lim) + struct plimit *lim; +{ + register struct plimit *copy; + + MALLOC(copy, struct plimit *, sizeof(struct plimit), + M_SUBPROC, M_WAITOK); + bcopy(lim->pl_rlimit, copy->pl_rlimit, + sizeof(struct rlimit) * RLIM_NLIMITS); + copy->p_lflags = 0; + copy->p_refcnt = 1; + return (copy); +} diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c new file mode 100644 index 000000000000..5683b9c7c935 --- /dev/null +++ b/sys/kern/kern_sig.c @@ -0,0 +1,1219 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sig.c 8.14 (Berkeley) 5/14/95 + */ + +#define SIGPROP /* include signal properties table */ +#include <sys/param.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/timeb.h> +#include <sys/times.h> +#include <sys/buf.h> +#include <sys/acct.h> +#include <sys/file.h> +#include <sys/kernel.h> +#include <sys/wait.h> +#include <sys/ktrace.h> +#include <sys/syslog.h> +#include <sys/stat.h> + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +#include <machine/cpu.h> + +#include <vm/vm.h> +#include <sys/user.h> /* for coredump */ + +void stop __P((struct proc *p)); + +/* + * Can process p, with pcred pc, send the signal signum to process q? 
+ */ +#define CANSIGNAL(p, pc, q, signum) \ + ((pc)->pc_ucred->cr_uid == 0 || \ + (pc)->p_ruid == (q)->p_cred->p_ruid || \ + (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \ + (pc)->p_ruid == (q)->p_ucred->cr_uid || \ + (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \ + ((signum) == SIGCONT && (q)->p_session == (p)->p_session)) + +/* ARGSUSED */ +int +sigaction(p, uap, retval) + struct proc *p; + register struct sigaction_args /* { + syscallarg(int) signum; + syscallarg(struct sigaction *) nsa; + syscallarg(struct sigaction *) osa; + } */ *uap; + register_t *retval; +{ + struct sigaction vec; + register struct sigaction *sa; + register struct sigacts *ps = p->p_sigacts; + register int signum; + int bit, error; + + signum = SCARG(uap, signum); + if (signum <= 0 || signum >= NSIG || + signum == SIGKILL || signum == SIGSTOP) + return (EINVAL); + sa = &vec; + if (SCARG(uap, osa)) { + sa->sa_handler = ps->ps_sigact[signum]; + sa->sa_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sa->sa_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sa->sa_flags |= SA_ONSTACK; + if ((ps->ps_sigintr & bit) == 0) + sa->sa_flags |= SA_RESTART; + if (p->p_flag & P_NOCLDSTOP) + sa->sa_flags |= SA_NOCLDSTOP; + if (error = copyout((caddr_t)sa, (caddr_t)SCARG(uap, osa), + sizeof (vec))) + return (error); + } + if (SCARG(uap, nsa)) { + if (error = copyin((caddr_t)SCARG(uap, nsa), (caddr_t)sa, + sizeof (vec))) + return (error); + setsigvec(p, signum, sa); + } + return (0); +} + +void +setsigvec(p, signum, sa) + register struct proc *p; + int signum; + register struct sigaction *sa; +{ + register struct sigacts *ps = p->p_sigacts; + register int bit; + + bit = sigmask(signum); + /* + * Change setting atomically. 
+ */ + (void) splhigh(); + ps->ps_sigact[signum] = sa->sa_handler; + ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask; + if ((sa->sa_flags & SA_RESTART) == 0) + ps->ps_sigintr |= bit; + else + ps->ps_sigintr &= ~bit; + if (sa->sa_flags & SA_ONSTACK) + ps->ps_sigonstack |= bit; + else + ps->ps_sigonstack &= ~bit; +#ifdef COMPAT_SUNOS + if (sa->sa_flags & SA_USERTRAMP) + ps->ps_usertramp |= bit; + else + ps->ps_usertramp &= ~bit; +#endif + if (signum == SIGCHLD) { + if (sa->sa_flags & SA_NOCLDSTOP) + p->p_flag |= P_NOCLDSTOP; + else + p->p_flag &= ~P_NOCLDSTOP; + } + /* + * Set bit in p_sigignore for signals that are set to SIG_IGN, + * and for signals set to SIG_DFL where the default is to ignore. + * However, don't put SIGCONT in p_sigignore, + * as we have to restart the process. + */ + if (sa->sa_handler == SIG_IGN || + (sigprop[signum] & SA_IGNORE && sa->sa_handler == SIG_DFL)) { + p->p_siglist &= ~bit; /* never to be seen again */ + if (signum != SIGCONT) + p->p_sigignore |= bit; /* easier in psignal */ + p->p_sigcatch &= ~bit; + } else { + p->p_sigignore &= ~bit; + if (sa->sa_handler == SIG_DFL) + p->p_sigcatch &= ~bit; + else + p->p_sigcatch |= bit; + } + (void) spl0(); +} + +/* + * Initialize signal state for process 0; + * set to ignore signals that are ignored by default. + */ +void +siginit(p) + struct proc *p; +{ + register int i; + + for (i = 0; i < NSIG; i++) + if (sigprop[i] & SA_IGNORE && i != SIGCONT) + p->p_sigignore |= sigmask(i); +} + +/* + * Reset signals for an exec of the specified process. + */ +void +execsigs(p) + register struct proc *p; +{ + register struct sigacts *ps = p->p_sigacts; + register int nc, mask; + + /* + * Reset caught signals. Held signals remain held + * through p_sigmask (unless they were caught, + * and are now ignored by default). 
+ */ + while (p->p_sigcatch) { + nc = ffs((long)p->p_sigcatch); + mask = sigmask(nc); + p->p_sigcatch &= ~mask; + if (sigprop[nc] & SA_IGNORE) { + if (nc != SIGCONT) + p->p_sigignore |= mask; + p->p_siglist &= ~mask; + } + ps->ps_sigact[nc] = SIG_DFL; + } + /* + * Reset stack state to the user stack. + * Clear set of signals caught on the signal stack. + */ + ps->ps_sigstk.ss_flags = SA_DISABLE; + ps->ps_sigstk.ss_size = 0; + ps->ps_sigstk.ss_base = 0; + ps->ps_flags = 0; +} + +/* + * Manipulate signal mask. + * Note that we receive new mask, not pointer, + * and return old mask as return value; + * the library stub does the rest. + */ +int +sigprocmask(p, uap, retval) + register struct proc *p; + struct sigprocmask_args /* { + syscallarg(int) how; + syscallarg(sigset_t) mask; + } */ *uap; + register_t *retval; +{ + int error = 0; + + *retval = p->p_sigmask; + (void) splhigh(); + + switch (SCARG(uap, how)) { + case SIG_BLOCK: + p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask; + break; + + case SIG_UNBLOCK: + p->p_sigmask &= ~SCARG(uap, mask); + break; + + case SIG_SETMASK: + p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + break; + + default: + error = EINVAL; + break; + } + (void) spl0(); + return (error); +} + +/* ARGSUSED */ +int +sigpending(p, uap, retval) + struct proc *p; + void *uap; + register_t *retval; +{ + + *retval = p->p_siglist; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Generalized interface signal handler, 4.3-compatible. 
+ */ +/* ARGSUSED */ +int +compat_43_sigvec(p, uap, retval) + struct proc *p; + register struct compat_43_sigvec_args /* { + syscallarg(int) signum; + syscallarg(struct sigvec *) nsv; + syscallarg(struct sigvec *) osv; + } */ *uap; + register_t *retval; +{ + struct sigvec vec; + register struct sigacts *ps = p->p_sigacts; + register struct sigvec *sv; + register int signum; + int bit, error; + + signum = SCARG(uap, signum); + if (signum <= 0 || signum >= NSIG || + signum == SIGKILL || signum == SIGSTOP) + return (EINVAL); + sv = &vec; + if (SCARG(uap, osv)) { + *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum]; + sv->sv_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sv->sv_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sv->sv_flags |= SV_ONSTACK; + if ((ps->ps_sigintr & bit) != 0) + sv->sv_flags |= SV_INTERRUPT; +#ifndef COMPAT_SUNOS + if (p->p_flag & P_NOCLDSTOP) + sv->sv_flags |= SA_NOCLDSTOP; +#endif + if (error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, osv), + sizeof (vec))) + return (error); + } + if (SCARG(uap, nsv)) { + if (error = copyin((caddr_t)SCARG(uap, nsv), (caddr_t)sv, + sizeof (vec))) + return (error); +#ifdef COMPAT_SUNOS + /* + * SunOS uses this bit (4, aka SA_DISABLE) as SV_RESETHAND, + * `reset to SIG_DFL on delivery'. We have no such option + * now or ever! 
+ */ + if (sv->sv_flags & SA_DISABLE) + return (EINVAL); + sv->sv_flags |= SA_USERTRAMP; +#endif + sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ + setsigvec(p, signum, (struct sigaction *)sv); + } + return (0); +} + +int +compat_43_sigblock(p, uap, retval) + register struct proc *p; + struct compat_43_sigblock_args /* { + syscallarg(int) mask; + } */ *uap; + register_t *retval; +{ + + (void) splhigh(); + *retval = p->p_sigmask; + p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask; + (void) spl0(); + return (0); +} + +int +compat_43_sigsetmask(p, uap, retval) + struct proc *p; + struct compat_43_sigsetmask_args /* { + syscallarg(int) mask; + } */ *uap; + register_t *retval; +{ + + (void) splhigh(); + *retval = p->p_sigmask; + p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + (void) spl0(); + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Suspend process until signal, providing mask to be set + * in the meantime. Note nonstandard calling convention: + * libc stub passes mask, not pointer, to save a copyin. + */ +/* ARGSUSED */ +int +sigsuspend(p, uap, retval) + register struct proc *p; + struct sigsuspend_args /* { + syscallarg(int) mask; + } */ *uap; + register_t *retval; +{ + register struct sigacts *ps = p->p_sigacts; + + /* + * When returning from sigpause, we want + * the old mask to be restored after the + * signal handler has finished. Thus, we + * save it here and mark the sigacts structure + * to indicate this. + */ + ps->ps_oldmask = p->p_sigmask; + ps->ps_flags |= SAS_OLDMASK; + p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0) + /* void */; + /* always return EINTR rather than ERESTART... 
*/ + return (EINTR); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* ARGSUSED */ +int +compat_43_sigstack(p, uap, retval) + struct proc *p; + register struct compat_43_sigstack_args /* { + syscallarg(struct sigstack *) nss; + syscallarg(struct sigstack *) oss; + } */ *uap; + register_t *retval; +{ + struct sigstack ss; + struct sigacts *psp; + int error = 0; + + psp = p->p_sigacts; + ss.ss_sp = psp->ps_sigstk.ss_base; + ss.ss_onstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; + if (SCARG(uap, oss) && (error = copyout((caddr_t)&ss, + (caddr_t)SCARG(uap, oss), sizeof (struct sigstack)))) + return (error); + if (SCARG(uap, nss) && (error = copyin((caddr_t)SCARG(uap, nss), + (caddr_t)&ss, sizeof (ss))) == 0) { + psp->ps_sigstk.ss_base = ss.ss_sp; + psp->ps_sigstk.ss_size = 0; + psp->ps_sigstk.ss_flags |= ss.ss_onstack & SA_ONSTACK; + psp->ps_flags |= SAS_ALTSTACK; + } + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* ARGSUSED */ +int +sigaltstack(p, uap, retval) + struct proc *p; + register struct sigaltstack_args /* { + syscallarg(struct sigaltstack *) nss; + syscallarg(struct sigaltstack *) oss; + } */ *uap; + register_t *retval; +{ + struct sigacts *psp; + struct sigaltstack ss; + int error; + + psp = p->p_sigacts; + if ((psp->ps_flags & SAS_ALTSTACK) == 0) + psp->ps_sigstk.ss_flags |= SA_DISABLE; + if (SCARG(uap, oss) && (error = copyout((caddr_t)&psp->ps_sigstk, + (caddr_t)SCARG(uap, oss), sizeof (struct sigaltstack)))) + return (error); + if (SCARG(uap, nss) == 0) + return (0); + if (error = copyin((caddr_t)SCARG(uap, nss), (caddr_t)&ss, + sizeof (ss))) + return (error); + if (ss.ss_flags & SA_DISABLE) { + if (psp->ps_sigstk.ss_flags & SA_ONSTACK) + return (EINVAL); + psp->ps_flags &= ~SAS_ALTSTACK; + psp->ps_sigstk.ss_flags = ss.ss_flags; + return (0); + } + if (ss.ss_size < MINSIGSTKSZ) + return (ENOMEM); + psp->ps_flags |= SAS_ALTSTACK; + psp->ps_sigstk= ss; + return (0); +} + +/* ARGSUSED */ +int +kill(cp, uap, retval) + register 
struct proc *cp; + register struct kill_args /* { + syscallarg(int) pid; + syscallarg(int) signum; + } */ *uap; + register_t *retval; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + + if ((u_int)SCARG(uap, signum) >= NSIG) + return (EINVAL); + if (SCARG(uap, pid) > 0) { + /* kill single process */ + if ((p = pfind(SCARG(uap, pid))) == NULL) + return (ESRCH); + if (!CANSIGNAL(cp, pc, p, SCARG(uap, signum))) + return (EPERM); + if (SCARG(uap, signum)) + psignal(p, SCARG(uap, signum)); + return (0); + } + switch (SCARG(uap, pid)) { + case -1: /* broadcast signal */ + return (killpg1(cp, SCARG(uap, signum), 0, 1)); + case 0: /* signal own process group */ + return (killpg1(cp, SCARG(uap, signum), 0, 0)); + default: /* negative explicit process group */ + return (killpg1(cp, SCARG(uap, signum), -SCARG(uap, pid), 0)); + } + /* NOTREACHED */ +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* ARGSUSED */ +int +compat_43_killpg(p, uap, retval) + struct proc *p; + register struct compat_43_killpg_args /* { + syscallarg(int) pgid; + syscallarg(int) signum; + } */ *uap; + register_t *retval; +{ + + if ((u_int)SCARG(uap, signum) >= NSIG) + return (EINVAL); + return (killpg1(p, SCARG(uap, signum), SCARG(uap, pgid), 0)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Common code for kill process group/broadcast kill. + * cp is calling process. + */ +int +killpg1(cp, signum, pgid, all) + register struct proc *cp; + int signum, pgid, all; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + struct pgrp *pgrp; + int nfound = 0; + + if (all) + /* + * broadcast + */ + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p == cp || !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + else { + if (pgid == 0) + /* + * zero pgid means send to my process group. 
+ */ + pgrp = cp->p_pgrp; + else { + pgrp = pgfind(pgid); + if (pgrp == NULL) + return (ESRCH); + } + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p->p_stat == SZOMB || + !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + } + return (nfound ? 0 : ESRCH); +} + +/* + * Send a signal to a process group. + */ +void +gsignal(pgid, signum) + int pgid, signum; +{ + struct pgrp *pgrp; + + if (pgid && (pgrp = pgfind(pgid))) + pgsignal(pgrp, signum, 0); +} + +/* + * Send a signal to a process group. If checktty is 1, + * limit to members which have a controlling terminal. + */ +void +pgsignal(pgrp, signum, checkctty) + struct pgrp *pgrp; + int signum, checkctty; +{ + register struct proc *p; + + if (pgrp) + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) + if (checkctty == 0 || p->p_flag & P_CONTROLT) + psignal(p, signum); +} + +/* + * Send a signal caused by a trap to the current process. + * If it will be caught immediately, deliver it with correct code. + * Otherwise, post it normally. + */ +void +trapsignal(p, signum, code) + struct proc *p; + register int signum; + u_long code; +{ + register struct sigacts *ps = p->p_sigacts; + int mask; + + mask = sigmask(signum); + if ((p->p_flag & P_TRACED) == 0 && (p->p_sigcatch & mask) != 0 && + (p->p_sigmask & mask) == 0) { + p->p_stats->p_ru.ru_nsignals++; +#ifdef KTRACE + if (KTRPOINT(p, KTR_PSIG)) + ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum], + p->p_sigmask, code); +#endif + sendsig(ps->ps_sigact[signum], signum, p->p_sigmask, code); + p->p_sigmask |= ps->ps_catchmask[signum] | mask; + } else { + ps->ps_code = code; /* XXX for core dump/debugger */ + ps->ps_sig = signum; /* XXX to verify code */ + psignal(p, signum); + } +} + +/* + * Send the signal to the process. 
If the signal has an action, the action + * is usually performed by the target process rather than the caller; we add + * the signal to the set of pending signals for the process. + * + * Exceptions: + * o When a stop signal is sent to a sleeping process that takes the + * default action, the process is stopped without awakening it. + * o SIGCONT restarts stopped processes (or puts them back to sleep) + * regardless of the signal action (eg, blocked or ignored). + * + * Other ignored signals are discarded immediately. + */ +void +psignal(p, signum) + register struct proc *p; + register int signum; +{ + register int s, prop; + register sig_t action; + int mask; + + if ((u_int)signum >= NSIG || signum == 0) + panic("psignal signal number"); + mask = sigmask(signum); + prop = sigprop[signum]; + + /* + * If proc is traced, always give parent a chance. + */ + if (p->p_flag & P_TRACED) + action = SIG_DFL; + else { + /* + * If the signal is being ignored, + * then we forget about it immediately. + * (Note: we don't set SIGCONT in p_sigignore, + * and if it is set to SIG_IGN, + * action will be SIG_DFL here.) + */ + if (p->p_sigignore & mask) + return; + if (p->p_sigmask & mask) + action = SIG_HOLD; + else if (p->p_sigcatch & mask) + action = SIG_CATCH; + else + action = SIG_DFL; + } + + if (p->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) && + (p->p_flag & P_TRACED) == 0) + p->p_nice = NZERO; + + if (prop & SA_CONT) + p->p_siglist &= ~stopsigmask; + + if (prop & SA_STOP) { + /* + * If sending a tty stop signal to a member of an orphaned + * process group, discard the signal here if the action + * is default; don't stop the process below if sleeping, + * and don't clear any pending SIGCONT. + */ + if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && + action == SIG_DFL) + return; + p->p_siglist &= ~contsigmask; + } + p->p_siglist |= mask; + + /* + * Defer further processing for signals which are held, + * except that stopped processes must be continued by SIGCONT. 
+ */ + if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) + return; + s = splhigh(); + switch (p->p_stat) { + + case SSLEEP: + /* + * If process is sleeping uninterruptibly + * we can't interrupt the sleep... the signal will + * be noticed when the process returns through + * trap() or syscall(). + */ + if ((p->p_flag & P_SINTR) == 0) + goto out; + /* + * Process is sleeping and traced... make it runnable + * so it can discover the signal in issignal() and stop + * for the parent. + */ + if (p->p_flag & P_TRACED) + goto run; + /* + * If SIGCONT is default (or ignored) and process is + * asleep, we are finished; the process should not + * be awakened. + */ + if ((prop & SA_CONT) && action == SIG_DFL) { + p->p_siglist &= ~mask; + goto out; + } + /* + * When a sleeping process receives a stop + * signal, process immediately if possible. + * All other (caught or default) signals + * cause the process to run. + */ + if (prop & SA_STOP) { + if (action != SIG_DFL) + goto runfast; + /* + * If a child holding parent blocked, + * stopping could cause deadlock. + */ + if (p->p_flag & P_PPWAIT) + goto out; + p->p_siglist &= ~mask; + p->p_xstat = signum; + if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0) + psignal(p->p_pptr, SIGCHLD); + stop(p); + goto out; + } else + goto runfast; + /*NOTREACHED*/ + + case SSTOP: + /* + * If traced process is already stopped, + * then no further action is necessary. + */ + if (p->p_flag & P_TRACED) + goto out; + + /* + * Kill signal always sets processes running. + */ + if (signum == SIGKILL) + goto runfast; + + if (prop & SA_CONT) { + /* + * If SIGCONT is default (or ignored), we continue the + * process but don't leave the signal in p_siglist, as + * it has no further action. If SIGCONT is held, we + * continue the process and leave the signal in + * p_siglist. If the process catches SIGCONT, let it + * handle the signal itself. If it isn't waiting on + * an event, then it goes back to run state. 
+ * Otherwise, process goes back to sleep state. + */ + if (action == SIG_DFL) + p->p_siglist &= ~mask; + if (action == SIG_CATCH) + goto runfast; + if (p->p_wchan == 0) + goto run; + p->p_stat = SSLEEP; + goto out; + } + + if (prop & SA_STOP) { + /* + * Already stopped, don't need to stop again. + * (If we did the shell could get confused.) + */ + p->p_siglist &= ~mask; /* take it away */ + goto out; + } + + /* + * If process is sleeping interruptibly, then simulate a + * wakeup so that when it is continued, it will be made + * runnable and can look at the signal. But don't make + * the process runnable, leave it stopped. + */ + if (p->p_wchan && p->p_flag & P_SINTR) + unsleep(p); + goto out; + + default: + /* + * SRUN, SIDL, SZOMB do nothing with the signal, + * other than kicking ourselves if we are running. + * It will either never be noticed, or noticed very soon. + */ + if (p == curproc) + signotify(p); + goto out; + } + /*NOTREACHED*/ + +runfast: + /* + * Raise priority to at least PUSER. + */ + if (p->p_priority > PUSER) + p->p_priority = PUSER; +run: + setrunnable(p); +out: + splx(s); +} + +/* + * If the current process has received a signal (should be caught or cause + * termination, should interrupt current syscall), return the signal number. + * Stop signals with default action are processed immediately, then cleared; + * they aren't returned. This is checked after each entry to the system for + * a syscall or trap (though this can usually be done without calling issignal + * by checking the pending signal masks in the CURSIG macro.) 
The normal call
 * sequence is
 *
 *	while (signum = CURSIG(curproc))
 *		postsig(signum);
 */
int
issignal(p)
	register struct proc *p;
{
	register int signum, mask, prop;

	/*
	 * Loop until a deliverable signal is found or none remain
	 * pending.  Signals are scanned lowest-numbered-first (ffs),
	 * so the lowest pending, unmasked signal is delivered first.
	 */
	for (;;) {
		mask = p->p_siglist & ~p->p_sigmask;
		if (p->p_flag & P_PPWAIT)
			mask &= ~stopsigmask;	/* vfork child: defer stops */
		if (mask == 0)	 	/* no signal to send */
			return (0);
		signum = ffs((long)mask);
		mask = sigmask(signum);
		prop = sigprop[signum];
		/*
		 * We should see pending but ignored signals
		 * only if P_TRACED was on when they were posted.
		 */
		if (mask & p->p_sigignore && (p->p_flag & P_TRACED) == 0) {
			p->p_siglist &= ~mask;
			continue;
		}
		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
			/*
			 * If traced, always stop, and stay
			 * stopped until released by the parent.
			 *
			 * Note that we must clear the pending signal
			 * before we call trace_req since that routine
			 * might cause a fault, calling tsleep and
			 * leading us back here again with the same signal.
			 * Then we would be deadlocked because the tracer
			 * would still be blocked on the ipc struct from
			 * the initial request.
			 */
			p->p_xstat = signum;	/* report reason to parent */
			p->p_siglist &= ~mask;
			psignal(p->p_pptr, SIGCHLD);
			do {
				stop(p);
				mi_switch();
			} while (!trace_req(p) && p->p_flag & P_TRACED);

			/*
			 * If parent wants us to take the signal,
			 * then it will leave it in p->p_xstat;
			 * otherwise we just look for signals again.
			 */
			signum = p->p_xstat;
			if (signum == 0)
				continue;

			/*
			 * Put the new signal into p_siglist.  If the
			 * signal is being masked, look for other signals.
			 */
			mask = sigmask(signum);
			p->p_siglist |= mask;
			if (p->p_sigmask & mask)
				continue;

			/*
			 * If the traced bit got turned off, go back up
			 * to the top to rescan signals.  This ensures
			 * that p_sig* and ps_sigact are consistent.
			 */
			if ((p->p_flag & P_TRACED) == 0)
				continue;
		}

		/*
		 * Decide whether the signal should be returned.
		 * Return the signal's number, or fall through
		 * to clear it from the pending mask.
		 */
		switch ((long)p->p_sigacts->ps_sigact[signum]) {

		case (long)SIG_DFL:
			/*
			 * Don't take default actions on system processes.
			 */
			if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
				/*
				 * Are you sure you want to ignore SIGSEGV
				 * in init? XXX
				 */
				printf("Process (pid %d) got signal %d\n",
					p->p_pid, signum);
#endif
				break;		/* == ignore */
			}
			/*
			 * If there is a pending stop signal to process
			 * with default action, stop here,
			 * then clear the signal.  However,
			 * if process is member of an orphaned
			 * process group, ignore tty stop signals.
			 */
			if (prop & SA_STOP) {
				if (p->p_flag & P_TRACED ||
				    (p->p_pgrp->pg_jobc == 0 &&
				    prop & SA_TTYSTOP))
					break;	/* == ignore */
				p->p_xstat = signum;
				stop(p);
				if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0)
					psignal(p->p_pptr, SIGCHLD);
				mi_switch();
				break;
			} else if (prop & SA_IGNORE) {
				/*
				 * Except for SIGCONT, shouldn't get here.
				 * Default action is to ignore; drop it.
				 */
				break;		/* == ignore */
			} else
				return (signum);
			/*NOTREACHED*/

		case (long)SIG_IGN:
			/*
			 * Masking above should prevent us ever trying
			 * to take action on an ignored signal other
			 * than SIGCONT, unless process is traced.
			 */
			if ((prop & SA_CONT) == 0 &&
			    (p->p_flag & P_TRACED) == 0)
				printf("issignal\n");
			break;		/* == ignore */

		default:
			/*
			 * This signal has an action, let
			 * postsig() process it.
			 */
			return (signum);
		}
		p->p_siglist &= ~mask;		/* take the signal! */
	}
	/* NOTREACHED */
}

/*
 * Put the argument process into the stopped state and notify the parent
 * via wakeup.  Signals are handled elsewhere.  The process must not be
 * on the run queue.  Callers that need the stop reason reported set
 * p_xstat before calling (see issignal() above).
 */
void
stop(p)
	register struct proc *p;
{

	p->p_stat = SSTOP;
	p->p_flag &= ~P_WAITED;		/* parent may report us again */
	wakeup((caddr_t)p->p_pptr);
}

/*
 * Take the action for the specified signal
 * from the current set of pending signals.
 */
void
postsig(signum)
	register int signum;
{
	register struct proc *p = curproc;
	register struct sigacts *ps = p->p_sigacts;
	register sig_t action;
	u_long code;
	int mask, returnmask;

#ifdef DIAGNOSTIC
	if (signum == 0)
		panic("postsig");
#endif
	mask = sigmask(signum);
	p->p_siglist &= ~mask;		/* signal is now being taken */
	action = ps->ps_sigact[signum];
#ifdef KTRACE
	if (KTRPOINT(p, KTR_PSIG))
		ktrpsig(p->p_tracep,
		    signum, action, ps->ps_flags & SAS_OLDMASK ?
		    ps->ps_oldmask : p->p_sigmask, 0);
#endif
	if (action == SIG_DFL) {
		/*
		 * Default action, where the default is to kill
		 * the process.  (Other cases were ignored above.)
		 */
		sigexit(p, signum);
		/* NOTREACHED */
	} else {
		/*
		 * If we get here, the signal must be caught.
		 */
#ifdef DIAGNOSTIC
		if (action == SIG_IGN || (p->p_sigmask & mask))
			panic("postsig action");
#endif
		/*
		 * Set the new mask value and also defer further
		 * occurences of this signal.
		 *
		 * Special case: user has done a sigpause.  Here the
		 * current mask is not of interest, but rather the
		 * mask from before the sigpause is what we want
		 * restored after the signal processing is completed.
		 */
		(void) splhigh();
		if (ps->ps_flags & SAS_OLDMASK) {
			returnmask = ps->ps_oldmask;
			ps->ps_flags &= ~SAS_OLDMASK;
		} else
			returnmask = p->p_sigmask;
		p->p_sigmask |= ps->ps_catchmask[signum] | mask;
		(void) spl0();
		p->p_stats->p_ru.ru_nsignals++;
		if (ps->ps_sig != signum) {
			code = 0;
		} else {
			/*
			 * A code was recorded for this signal; clear the
			 * saved copy so it is delivered only once.
			 */
			code = ps->ps_code;
			ps->ps_code = 0;
			ps->ps_sig = 0;
		}
		sendsig(action, signum, returnmask, code);
	}
}

/*
 * Kill the given process for the stated reason: note it in the system
 * log, print a message on the user's terminal, and post SIGKILL.
 */
void
killproc(p, why)
	struct proc *p;
	char *why;
{

	log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
	uprintf("sorry, pid %d was killed: %s\n", p->p_pid, why);
	psignal(p, SIGKILL);
}

/*
 * Force the current process to exit with the specified signal, dumping core
 * if appropriate.
We bypass the normal tests for masked and caught signals,
 * allowing unrecoverable failures to terminate the process without changing
 * signal state.  Mark the accounting record with the signal termination.
 * If dumping core, save the signal number for the debugger.  Calls exit and
 * does not return.
 */
void
sigexit(p, signum)
	register struct proc *p;
	int signum;
{

	p->p_acflag |= AXSIG;		/* accounting: killed by a signal */
	if (sigprop[signum] & SA_CORE) {
		p->p_sigacts->ps_sig = signum;	/* saved for the debugger */
		/*
		 * Set WCOREFLAG in the exit status only if the dump
		 * actually succeeded.
		 */
		if (coredump(p) == 0)
			signum |= WCOREFLAG;
	}
	exit1(p, W_EXITCODE(0, signum));
	/* NOTREACHED */
}

/*
 * Dump core, into a file named "progname.core", unless the process was
 * setuid/setgid.
 *
 * Returns 0 on success or an errno value; EFAULT also serves as the
 * "refused to dump" indication (credentials changed, dump would exceed
 * RLIMIT_CORE, or the target is not a plain single-link regular file).
 */
int
coredump(p)
	register struct proc *p;
{
	register struct vnode *vp;
	register struct pcred *pcred = p->p_cred;
	register struct ucred *cred = pcred->pc_ucred;
	register struct vmspace *vm = p->p_vmspace;
	struct nameidata nd;
	struct vattr vattr;
	int error, error1;
	char name[MAXCOMLEN+6];		/* progname.core */

	/* Refuse if saved ids differ from real ids (setuid/setgid image). */
	if (pcred->p_svuid != pcred->p_ruid || pcred->p_svgid != pcred->p_rgid)
		return (EFAULT);
	/* Refuse if the dump would exceed the core file size limit. */
	if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >=
	    p->p_rlimit[RLIMIT_CORE].rlim_cur)
		return (EFAULT);
	sprintf(name, "%s.core", p->p_comm);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p);
	if (error = vn_open(&nd,
	    O_CREAT | FWRITE, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH))
		return (error);
	vp = nd.ni_vp;

	/* Don't dump to non-regular files or files with links. */
	if (vp->v_type != VREG ||
	    VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) {
		error = EFAULT;
		goto out;
	}
	/* Truncate any existing contents. */
	VATTR_NULL(&vattr);
	vattr.va_size = 0;
	VOP_LEASE(vp, p, cred, LEASE_WRITE);
	VOP_SETATTR(vp, &vattr, cred, p);
	p->p_acflag |= ACORE;		/* accounting: dumped core */
	/*
	 * Refresh the kp_proc/kp_eproc copies in the process's u-area
	 * so the dump carries current state, then write the
	 * machine-dependent part (cpu_coredump), the data segment,
	 * and finally the stack.
	 */
	bcopy(p, &p->p_addr->u_kproc.kp_proc, sizeof(struct proc));
	fill_eproc(p, &p->p_addr->u_kproc.kp_eproc);
	error = cpu_coredump(p, vp, cred);
	if (error == 0)
		error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
		    (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE,
		    IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
	if (error == 0)
		error = vn_rdwr(UIO_WRITE, vp,
		    (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
		    round_page(ctob(vm->vm_ssize)),
		    (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE,
		    IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
out:
	VOP_UNLOCK(vp, 0, p);
	error1 = vn_close(vp, FWRITE, cred, p);
	if (error == 0)
		error = error1;	/* report close failure if write was clean */
	return (error);
}

/*
 * Nonexistent system call-- signal process (may want to handle it).
 * Flag error in case process won't see signal immediately (blocked or ignored).
 */
/* ARGSUSED */
int
nosys(p, args, retval)
	struct proc *p;
	void *args;
	register_t *retval;
{

	psignal(p, SIGSYS);
	return (ENOSYS);
}
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
new file mode 100644
index 000000000000..df8371077adf
--- /dev/null
+++ b/sys/kern/kern_subr.c
@@ -0,0 +1,215 @@
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * @(#)kern_subr.c	8.4 (Berkeley) 2/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/queue.h>

/*
 * Move n bytes between the kernel buffer cp and the (possibly scattered)
 * iovec list described by uio, in the direction given by uio->uio_rw.
 * Advances the uio (iov_base/iov_len/uio_resid/uio_offset) past the data
 * moved.  Returns 0, or an error from copyin/copyout.
 */
int
uiomove(cp, n, uio)
	register caddr_t cp;
	register int n;
	register struct uio *uio;
{
	register struct iovec *iov;
	u_int cnt;
	int error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE)
		panic("uiomove: mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("uiomove proc");
#endif
	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Empty iovec: step to the next one. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;
		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
		case UIO_USERISPACE:
			/*
			 * UIO_READ means data flows from the kernel
			 * buffer out to the caller, hence copyout.
			 */
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy((caddr_t)cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, (caddr_t)cp, cnt);
			break;
		}
		/* Account for the chunk just transferred. */
		iov->iov_base += cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}
	return (error);
}

/*
 * Give next character to user as result of read.
 */
int
ureadc(c, uio)
	register int c;
	register struct uio *uio;
{
	register struct iovec *iov;

	if (uio->uio_resid <= 0)
		panic("ureadc: non-positive resid");
again:
	if (uio->uio_iovcnt <= 0)
		panic("ureadc: non-positive iovcnt");
	iov = uio->uio_iov;
	if (iov->iov_len <= 0) {
		/* Skip exhausted iovecs. */
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		*iov->iov_base = c;
		break;

	case UIO_USERISPACE:
		if (suibyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;
	}
	/* Consume one byte of the uio. */
	iov->iov_base++;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

#ifdef vax /* unused except by ct.c, other oddities XXX */
/*
 * Get next character written in by user from uio.
 * Returns the character, or -1 when the uio is exhausted or the
 * user fetch fails.
 */
int
uwritec(uio)
	struct uio *uio;
{
	register struct iovec *iov;
	register int c;

	if (uio->uio_resid <= 0)
		return (-1);
again:
	if (uio->uio_iovcnt <= 0)
		panic("uwritec: non-positive iovcnt");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iov++;
		if (--uio->uio_iovcnt == 0)
			return (-1);
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		c = fubyte(iov->iov_base);
		break;

	case UIO_SYSSPACE:
		c = *(u_char *) iov->iov_base;
		break;

	case UIO_USERISPACE:
		c = fuibyte(iov->iov_base);
		break;
	}
	if (c < 0)
		return (-1);
	/* Consume one byte of the uio. */
	iov->iov_base++;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (c);
}
#endif /* vax */

/*
 * General routine to allocate a hash table.
 * Rounds the element count down to a power of two, allocates that many
 * list heads, and returns the table; *hashmask receives size-1 for use
 * as an and-mask on hash values.  May sleep (M_WAITOK).
 */
void *
hashinit(elements, type, hashmask)
	int elements, type;
	u_long *hashmask;
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad cnt");
	/* Largest power of two <= elements. */
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
new file mode 100644
index 000000000000..6c8202731f69
--- /dev/null
+++ b/sys/kern/kern_synch.c
@@ -0,0 +1,671 @@
/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4.
Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

u_char	curpriority;		/* usrpri of curproc */
int	lbolt;			/* once a second sleep address */

/*
 * Force switch among equal priority processes every 100ms.
 * Driven by the callout queue: each invocation requests a reschedule
 * and re-arms itself hz/10 ticks later.
 */
/* ARGSUSED */
void
roundrobin(arg)
	void *arg;
{

	need_resched();
	timeout(roundrobin, NULL, hz / 10);
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
+ * + * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. + * That is, the system wants to compute a value of decay such + * that the following for loop: + * for (i = 0; i < (5 * loadavg); i++) + * p_estcpu *= decay; + * will compute + * p_estcpu *= 0.1; + * for all values of loadavg: + * + * Mathematically this loop can be expressed by saying: + * decay ** (5 * loadavg) ~= .1 + * + * The system computes decay as: + * decay = (2 * loadavg) / (2 * loadavg + 1) + * + * We wish to prove that the system's computation of decay + * will always fulfill the equation: + * decay ** (5 * loadavg) ~= .1 + * + * If we compute b as: + * b = 2 * loadavg + * then + * decay = b / (b + 1) + * + * We now need to prove two things: + * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) + * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) + * + * Facts: + * For x close to zero, exp(x) =~ 1 + x, since + * exp(x) = 0! + x**1/1! + x**2/2! + ... . + * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. + * For x close to zero, ln(1+x) =~ x, since + * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 + * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). + * ln(.1) =~ -2.30 + * + * Proof of (1): + * Solve (factor)**(power) =~ .1 given power (5*loadav): + * solving for factor, + * ln(factor) =~ (-2.30/5*loadav), or + * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = + * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED + * + * Proof of (2): + * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): + * solving for power, + * power*ln(b/(b+1)) =~ -2.30, or + * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. 
QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you dont want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 * Walks allproc once a second: ages p_pctcpu, decays p_estcpu by the
 * load-dependent factor above, and requeues a runnable process when its
 * priority moved to a different run queue.  Re-arms itself via timeout().
 */
/* ARGSUSED */
void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int s;
	register unsigned int newcpu;

	wakeup((caddr_t)&lbolt);	/* rouse once-a-second sleepers */
	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (hz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / hz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / hz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
		p->p_estcpu = min(newcpu, UCHAR_MAX);	/* clamp to 8 bits */
		resetpriority(p);
		if (p->p_priority >= PUSER) {
#define	PPQ	(128 / NQS)		/* priorities per queue */
			if ((p != curproc) &&
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				/*
				 * Priority crossed a queue boundary:
				 * remove and reinsert on the new queue.
				 */
				remrq(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	vmmeter();
	if (bclnlist != NULL)
		wakeup((caddr_t)pageproc);	/* bclnlist non-empty */
	timeout(schedcpu, (void *)0, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(p)
	register struct proc *p;
{
	register unsigned int newcpu = p->p_estcpu;
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;	/* slept long enough: forget it all */
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		/* Apply one decay step per remaining second slept. */
		while (newcpu && --p->p_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = min(newcpu, UCHAR_MAX);
	}
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
#define LOOKUP(x)	(((long)(x) >> 8) & (TABLESIZE - 1))
/* Hashed sleep queues: singly linked lists with a tail pointer. */
struct slpque {
	struct proc *sq_head;
	struct proc **sq_tailp;
} slpque[TABLESIZE];

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
 */
int
tsleep(ident, priority, wmesg, timo)
	void *ident;
	int priority, timo;
	char *wmesg;
{
	register struct proc *p = curproc;
	register struct slpque *qp;
	register s;			/* implicit int (K&R) */
	int sig, catch = priority & PCATCH;
	extern int cold;
	void endtsleep __P((void *));

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return (0);
	}
#ifdef DIAGNOSTIC
	if (ident == NULL || p->p_stat != SRUN || p->p_back)
		panic("tsleep");
#endif
	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	/* Append ourselves to the tail of the hashed sleep queue. */
	qp = &slpque[LOOKUP(ident)];
	if (qp->sq_head == 0)
		qp->sq_head = p;
	else
		*qp->sq_tailp = p;
	*(qp->sq_tailp = &p->p_forw) = 0;
	if (timo)
		timeout(endtsleep, (void *)p, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		p->p_flag |= P_SINTR;
		if (sig = CURSIG(p)) {
			/* Signal already pending: abort the sleep. */
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		if (p->p_wchan == 0) {
			/* Woken while stopped in CURSIG. */
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;	/* voluntary context switch */
	mi_switch();
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	if (p->p_flag & P_TIMEOUT) {
		/* endtsleep() fired; a pending signal takes precedence. */
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			return (EWOULDBLOCK);
		}
	} else if (timo)
		untimeout(endtsleep, (void *)p);
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		/* ps_sigintr says whether this signal interrupts or restarts. */
		if (p->p_sigacts->ps_sigintr & sigmask(sig))
			return (EINTR);
		return (ERESTART);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	return (0);
}

/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;
	int s;

	p = (struct proc *)arg;
	s = splhigh();
	if (p->p_wchan) {
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);	/* stopped: stay stopped */
		p->p_flag |= P_TIMEOUT;	/* tells tsleep() the timer fired */
	}
	splx(s);
}

/*
 * Short-term, non-interruptable sleep.
 */
void
sleep(ident, priority)
	void *ident;
	int priority;
{
	register struct proc *p = curproc;
	register struct slpque *qp;
	register s;			/* implicit int (K&R) */
	extern int cold;

#ifdef DIAGNOSTIC
	if (priority > PZERO) {
		printf("sleep called with priority %d > PZERO, wchan: %x\n",
		    priority, ident);
		panic("old sleep");
	}
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return;
	}
#ifdef DIAGNOSTIC
	if (ident == NULL || p->p_stat != SRUN || p->p_back)
		panic("sleep");
#endif
	p->p_wchan = ident;
	p->p_wmesg = NULL;
	p->p_slptime = 0;
	p->p_priority = priority;
	/* Append to the tail of the hashed sleep queue. */
	qp = &slpque[LOOKUP(ident)];
	if (qp->sq_head == 0)
		qp->sq_head = p;
	else
		*qp->sq_tailp = p;
	*(qp->sq_tailp = &p->p_forw) = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;	/* voluntary context switch */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	mi_switch();
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	curpriority = p->p_usrpri;
	splx(s);
}

/*
 * Remove a process from its wait queue.
 */
void
unsleep(p)
	register struct proc *p;
{
	register struct slpque *qp;
	register struct proc **hp;
	int s;

	s = splhigh();
	if (p->p_wchan) {
		/* Walk the hash chain to find the link pointing at us. */
		hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head;
		while (*hp != p)
			hp = &(*hp)->p_forw;
		*hp = p->p_forw;
		if (qp->sq_tailp == &p->p_forw)
			qp->sq_tailp = hp;	/* we were the tail */
		p->p_wchan = 0;
	}
	splx(s);
}

/*
 * Make all processes sleeping on the specified
identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpque *qp;
	register struct proc *p, **q;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];
restart:
	for (q = &qp->sq_head; p = *q; ) {
#ifdef DIAGNOSTIC
		if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP)
			panic("wakeup");
#endif
		if (p->p_wchan == ident) {
			/* Unlink from the sleep queue. */
			p->p_wchan = 0;
			*q = p->p_forw;
			if (qp->sq_tailp == &p->p_forw)
				qp->sq_tailp = q;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM)
					setrunqueue(p);
				/*
				 * Since curpriority is a user priority,
				 * p->p_priority is always better than
				 * curpriority.
				 */
				if ((p->p_flag & P_INMEM) == 0)
					wakeup((caddr_t)&proc0);
				else
					need_resched();
				/* END INLINE EXPANSION */
				/* List may have changed; rescan from the top. */
				goto restart;
			}
		} else
			q = &p->p_forw;
	}
	splx(s);
}

/*
 * The machine independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	register long s, u;
	struct timeval tv;

#ifdef DEBUG
	if (p->p_simple_locks)
		panic("sleep: holding simple lock");
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec + (tv.tv_usec - runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - runtime.tv_sec);
	/* Normalize the microseconds field into [0, 1000000). */
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		if (s >= rlim->rlim_max)
			psignal(p, SIGKILL);
		else {
			psignal(p, SIGXCPU);
			/* 5 more seconds of grace before the next SIGXCPU. */
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	/* Only non-root processes still at the default nice are demoted. */
	if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) {
		p->p_nice = NZERO + 4;
		resetpriority(p);
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	cpu_switch(p);
	microtime(&runtime);
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.  Each queue head points at itself.
 */
void
rqinit()
{
	register int i;

	for (i = 0; i < NQS; i++)
		qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);	/* decay estcpu for the time slept */
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0)
		wakeup((caddr_t)&proc0);	/* awaken the swapper */
	else if (p->p_priority < curpriority)
		need_resched();
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
+ */ +void +resetpriority(p) + register struct proc *p; +{ + register unsigned int newpriority; + + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + if (newpriority < curpriority) + need_resched(); +} diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c new file mode 100644 index 000000000000..b178da3a0302 --- /dev/null +++ b/sys/kern/kern_sysctl.c @@ -0,0 +1,793 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95 + */ + +/* + * sysctl system call. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/unistd.h> +#include <sys/buf.h> +#include <sys/ioctl.h> +#include <sys/tty.h> +#include <vm/vm.h> +#include <sys/sysctl.h> + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +sysctlfn kern_sysctl; +sysctlfn hw_sysctl; +#ifdef DEBUG +sysctlfn debug_sysctl; +#endif +extern sysctlfn vm_sysctl; +extern sysctlfn vfs_sysctl; +extern sysctlfn net_sysctl; +extern sysctlfn cpu_sysctl; + +/* + * Locking and stats + */ +static struct sysctl_lock { + int sl_lock; + int sl_want; + int sl_locked; +} memlock; + +int +__sysctl(p, uap, retval) + struct proc *p; + register struct __sysctl_args /* { + syscallarg(int *) name; + syscallarg(u_int) namelen; + syscallarg(void *) old; + syscallarg(size_t *) oldlenp; + syscallarg(void *) new; + syscallarg(size_t) newlen; + } */ *uap; + register_t *retval; +{ + int error, dolock = 1; + size_t savelen, oldlen = 0; + sysctlfn *fn; + int name[CTL_MAXNAME]; + + if (SCARG(uap, new) != NULL && + (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + /* + * all top-level sysctl names are non-terminal + */ + if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 2) + return (EINVAL); + if (error = + copyin(SCARG(uap, name), &name, 
SCARG(uap, namelen) * sizeof(int))) + return (error); + + switch (name[0]) { + case CTL_KERN: + fn = kern_sysctl; + if (name[2] == KERN_VNODE) /* XXX */ + dolock = 0; + break; + case CTL_HW: + fn = hw_sysctl; + break; + case CTL_VM: + fn = vm_sysctl; + break; + case CTL_NET: + fn = net_sysctl; + break; + case CTL_VFS: + fn = vfs_sysctl; + break; + case CTL_MACHDEP: + fn = cpu_sysctl; + break; +#ifdef DEBUG + case CTL_DEBUG: + fn = debug_sysctl; + break; +#endif + default: + return (EOPNOTSUPP); + } + + if (SCARG(uap, oldlenp) && + (error = copyin(SCARG(uap, oldlenp), &oldlen, sizeof(oldlen)))) + return (error); + if (SCARG(uap, old) != NULL) { + if (!useracc(SCARG(uap, old), oldlen, B_WRITE)) + return (EFAULT); + while (memlock.sl_lock) { + memlock.sl_want = 1; + sleep((caddr_t)&memlock, PRIBIO+1); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + if (dolock) + vslock(SCARG(uap, old), oldlen); + savelen = oldlen; + } + error = (*fn)(name + 1, SCARG(uap, namelen) - 1, SCARG(uap, old), + &oldlen, SCARG(uap, new), SCARG(uap, newlen), p); + if (SCARG(uap, old) != NULL) { + if (dolock) + vsunlock(SCARG(uap, old), savelen, B_WRITE); + memlock.sl_lock = 0; + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + } + if (error) + return (error); + if (SCARG(uap, oldlenp)) + error = copyout(&oldlen, SCARG(uap, oldlenp), sizeof(oldlen)); + *retval = oldlen; + return (0); +} + +/* + * Attributes stored in the kernel. + */ +char hostname[MAXHOSTNAMELEN]; +int hostnamelen; +long hostid; +int securelevel; + +/* + * kernel related system variables. 
+ */ +kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + int error, level, inthostid; + extern char ostype[], osrelease[], version[]; + + /* all sysctl names at this level are terminal */ + if (namelen != 1 && !(name[0] == KERN_PROC || name[0] == KERN_PROF)) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case KERN_OSTYPE: + return (sysctl_rdstring(oldp, oldlenp, newp, ostype)); + case KERN_OSRELEASE: + return (sysctl_rdstring(oldp, oldlenp, newp, osrelease)); + case KERN_OSREV: + return (sysctl_rdint(oldp, oldlenp, newp, BSD)); + case KERN_VERSION: + return (sysctl_rdstring(oldp, oldlenp, newp, version)); + case KERN_MAXVNODES: + return(sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes)); + case KERN_MAXPROC: + return (sysctl_int(oldp, oldlenp, newp, newlen, &maxproc)); + case KERN_MAXFILES: + return (sysctl_int(oldp, oldlenp, newp, newlen, &maxfiles)); + case KERN_ARGMAX: + return (sysctl_rdint(oldp, oldlenp, newp, ARG_MAX)); + case KERN_SECURELVL: + level = securelevel; + if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &level)) || + newp == NULL) + return (error); + if (level < securelevel && p->p_pid != 1) + return (EPERM); + securelevel = level; + return (0); + case KERN_HOSTNAME: + error = sysctl_string(oldp, oldlenp, newp, newlen, + hostname, sizeof(hostname)); + if (newp && !error) + hostnamelen = newlen; + return (error); + case KERN_HOSTID: + inthostid = hostid; /* XXX assumes sizeof long <= sizeof int */ + error = sysctl_int(oldp, oldlenp, newp, newlen, &inthostid); + hostid = inthostid; + return (error); + case KERN_CLOCKRATE: + return (sysctl_clockrate(oldp, oldlenp)); + case KERN_BOOTTIME: + return (sysctl_rdstruct(oldp, oldlenp, newp, &boottime, + sizeof(struct timeval))); + case KERN_VNODE: + return (sysctl_vnode(oldp, oldlenp, p)); + case KERN_PROC: + return (sysctl_doproc(name + 1, namelen - 1, oldp, 
oldlenp)); + case KERN_FILE: + return (sysctl_file(oldp, oldlenp)); +#ifdef GPROF + case KERN_PROF: + return (sysctl_doprof(name + 1, namelen - 1, oldp, oldlenp, + newp, newlen)); +#endif + case KERN_POSIX1: + return (sysctl_rdint(oldp, oldlenp, newp, _POSIX_VERSION)); + case KERN_NGROUPS: + return (sysctl_rdint(oldp, oldlenp, newp, NGROUPS_MAX)); + case KERN_JOB_CONTROL: + return (sysctl_rdint(oldp, oldlenp, newp, 1)); + case KERN_SAVED_IDS: +#ifdef _POSIX_SAVED_IDS + return (sysctl_rdint(oldp, oldlenp, newp, 1)); +#else + return (sysctl_rdint(oldp, oldlenp, newp, 0)); +#endif + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +/* + * hardware related system variables. + */ +hw_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + extern char machine[], cpu_model[]; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case HW_MACHINE: + return (sysctl_rdstring(oldp, oldlenp, newp, machine)); + case HW_MODEL: + return (sysctl_rdstring(oldp, oldlenp, newp, cpu_model)); + case HW_NCPU: + return (sysctl_rdint(oldp, oldlenp, newp, 1)); /* XXX */ + case HW_BYTEORDER: + return (sysctl_rdint(oldp, oldlenp, newp, BYTE_ORDER)); + case HW_PHYSMEM: + return (sysctl_rdint(oldp, oldlenp, newp, ctob(physmem))); + case HW_USERMEM: + return (sysctl_rdint(oldp, oldlenp, newp, + ctob(physmem - cnt.v_wire_count))); + case HW_PAGESIZE: + return (sysctl_rdint(oldp, oldlenp, newp, PAGE_SIZE)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +#ifdef DEBUG +/* + * Debugging related system variables. 
+ */ +struct ctldebug debug0, debug1, debug2, debug3, debug4; +struct ctldebug debug5, debug6, debug7, debug8, debug9; +struct ctldebug debug10, debug11, debug12, debug13, debug14; +struct ctldebug debug15, debug16, debug17, debug18, debug19; +static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { + &debug0, &debug1, &debug2, &debug3, &debug4, + &debug5, &debug6, &debug7, &debug8, &debug9, + &debug10, &debug11, &debug12, &debug13, &debug14, + &debug15, &debug16, &debug17, &debug18, &debug19, +}; +int +debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + struct ctldebug *cdp; + + /* all sysctl names at this level are name and field */ + if (namelen != 2) + return (ENOTDIR); /* overloaded */ + cdp = debugvars[name[0]]; + if (name[0] >= CTL_DEBUG_MAXID || cdp->debugname == 0) + return (EOPNOTSUPP); + switch (name[1]) { + case CTL_DEBUG_NAME: + return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); + case CTL_DEBUG_VALUE: + return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} +#endif /* DEBUG */ + +/* + * Validate parameters and get old / set new parameters + * for an integer-valued sysctl function. + */ +sysctl_int(oldp, oldlenp, newp, newlen, valp) + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + int *valp; +{ + int error = 0; + + if (oldp && *oldlenp < sizeof(int)) + return (ENOMEM); + if (newp && newlen != sizeof(int)) + return (EINVAL); + *oldlenp = sizeof(int); + if (oldp) + error = copyout(valp, oldp, sizeof(int)); + if (error == 0 && newp) + error = copyin(newp, valp, sizeof(int)); + return (error); +} + +/* + * As above, but read-only. 
+ */ +sysctl_rdint(oldp, oldlenp, newp, val) + void *oldp; + size_t *oldlenp; + void *newp; + int val; +{ + int error = 0; + + if (oldp && *oldlenp < sizeof(int)) + return (ENOMEM); + if (newp) + return (EPERM); + *oldlenp = sizeof(int); + if (oldp) + error = copyout((caddr_t)&val, oldp, sizeof(int)); + return (error); +} + +/* + * Validate parameters and get old / set new parameters + * for a string-valued sysctl function. + */ +sysctl_string(oldp, oldlenp, newp, newlen, str, maxlen) + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + char *str; + int maxlen; +{ + int len, error = 0; + + len = strlen(str) + 1; + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp && newlen >= maxlen) + return (EINVAL); + if (oldp) { + *oldlenp = len; + error = copyout(str, oldp, len); + } + if (error == 0 && newp) { + error = copyin(newp, str, newlen); + str[newlen] = 0; + } + return (error); +} + +/* + * As above, but read-only. + */ +sysctl_rdstring(oldp, oldlenp, newp, str) + void *oldp; + size_t *oldlenp; + void *newp; + char *str; +{ + int len, error = 0; + + len = strlen(str) + 1; + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp) + return (EPERM); + *oldlenp = len; + if (oldp) + error = copyout(str, oldp, len); + return (error); +} + +/* + * Validate parameters and get old / set new parameters + * for a structure oriented sysctl function. + */ +sysctl_struct(oldp, oldlenp, newp, newlen, sp, len) + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + void *sp; + int len; +{ + int error = 0; + + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp && newlen > len) + return (EINVAL); + if (oldp) { + *oldlenp = len; + error = copyout(sp, oldp, len); + } + if (error == 0 && newp) + error = copyin(newp, sp, len); + return (error); +} + +/* + * Validate parameters and get old parameters + * for a structure oriented sysctl function. 
+ */ +sysctl_rdstruct(oldp, oldlenp, newp, sp, len) + void *oldp; + size_t *oldlenp; + void *newp, *sp; + int len; +{ + int error = 0; + + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp) + return (EPERM); + *oldlenp = len; + if (oldp) + error = copyout(sp, oldp, len); + return (error); +} + +/* + * Get file structures. + */ +sysctl_file(where, sizep) + char *where; + size_t *sizep; +{ + int buflen, error; + struct file *fp; + char *start = where; + + buflen = *sizep; + if (where == NULL) { + /* + * overestimate by 10 files + */ + *sizep = sizeof(filehead) + (nfiles + 10) * sizeof(struct file); + return (0); + } + + /* + * first copyout filehead + */ + if (buflen < sizeof(filehead)) { + *sizep = 0; + return (0); + } + if (error = copyout((caddr_t)&filehead, where, sizeof(filehead))) + return (error); + buflen -= sizeof(filehead); + where += sizeof(filehead); + + /* + * followed by an array of file structures + */ + for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { + if (buflen < sizeof(struct file)) { + *sizep = where - start; + return (ENOMEM); + } + if (error = copyout((caddr_t)fp, where, sizeof (struct file))) + return (error); + buflen -= sizeof(struct file); + where += sizeof(struct file); + } + *sizep = where - start; + return (0); +} + +/* + * try over estimating by 5 procs + */ +#define KERN_PROCSLOP (5 * sizeof (struct kinfo_proc)) + +sysctl_doproc(name, namelen, where, sizep) + int *name; + u_int namelen; + char *where; + size_t *sizep; +{ + register struct proc *p; + register struct kinfo_proc *dp = (struct kinfo_proc *)where; + register int needed = 0; + int buflen = where != NULL ? *sizep : 0; + int doingzomb; + struct eproc eproc; + int error = 0; + + if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL)) + return (EINVAL); + p = allproc.lh_first; + doingzomb = 0; +again: + for (; p != 0; p = p->p_list.le_next) { + /* + * Skip embryonic processes. 
+ */ + if (p->p_stat == SIDL) + continue; + /* + * TODO - make more efficient (see notes below). + * do by session. + */ + switch (name[0]) { + + case KERN_PROC_PID: + /* could do this with just a lookup */ + if (p->p_pid != (pid_t)name[1]) + continue; + break; + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp->pg_id != (pid_t)name[1]) + continue; + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session->s_ttyp == NULL || + p->p_session->s_ttyp->t_dev != (dev_t)name[1]) + continue; + break; + + case KERN_PROC_UID: + if (p->p_ucred->cr_uid != (uid_t)name[1]) + continue; + break; + + case KERN_PROC_RUID: + if (p->p_cred->p_ruid != (uid_t)name[1]) + continue; + break; + } + if (buflen >= sizeof(struct kinfo_proc)) { + fill_eproc(p, &eproc); + if (error = copyout((caddr_t)p, &dp->kp_proc, + sizeof(struct proc))) + return (error); + if (error = copyout((caddr_t)&eproc, &dp->kp_eproc, + sizeof(eproc))) + return (error); + dp++; + buflen -= sizeof(struct kinfo_proc); + } + needed += sizeof(struct kinfo_proc); + } + if (doingzomb == 0) { + p = zombproc.lh_first; + doingzomb++; + goto again; + } + if (where != NULL) { + *sizep = (caddr_t)dp - where; + if (needed > *sizep) + return (ENOMEM); + } else { + needed += KERN_PROCSLOP; + *sizep = needed; + } + return (0); +} + +/* + * Fill in an eproc structure for the specified process. 
+ */ +void +fill_eproc(p, ep) + register struct proc *p; + register struct eproc *ep; +{ + register struct tty *tp; + + ep->e_paddr = p; + ep->e_sess = p->p_pgrp->pg_session; + ep->e_pcred = *p->p_cred; + ep->e_ucred = *p->p_ucred; + if (p->p_stat == SIDL || p->p_stat == SZOMB) { + ep->e_vm.vm_rssize = 0; + ep->e_vm.vm_tsize = 0; + ep->e_vm.vm_dsize = 0; + ep->e_vm.vm_ssize = 0; +#ifndef sparc + /* ep->e_vm.vm_pmap = XXX; */ +#endif + } else { + register struct vmspace *vm = p->p_vmspace; + +#ifdef pmap_resident_count + ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ +#else + ep->e_vm.vm_rssize = vm->vm_rssize; +#endif + ep->e_vm.vm_tsize = vm->vm_tsize; + ep->e_vm.vm_dsize = vm->vm_dsize; + ep->e_vm.vm_ssize = vm->vm_ssize; +#ifndef sparc + ep->e_vm.vm_pmap = vm->vm_pmap; +#endif + } + if (p->p_pptr) + ep->e_ppid = p->p_pptr->p_pid; + else + ep->e_ppid = 0; + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + if ((p->p_flag & P_CONTROLT) && + (tp = ep->e_sess->s_ttyp)) { + ep->e_tdev = tp->t_dev; + ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + ep->e_tsess = tp->t_session; + } else + ep->e_tdev = NODEV; + ep->e_flag = ep->e_sess->s_ttyvp ? 
EPROC_CTTY : 0; + if (SESS_LEADER(p)) + ep->e_flag |= EPROC_SLEADER; + if (p->p_wmesg) + strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); + ep->e_xsize = ep->e_xrssize = 0; + ep->e_xccount = ep->e_xswrss = 0; +} + +#ifdef COMPAT_43 +#include <sys/socket.h> +#define KINFO_PROC (0<<8) +#define KINFO_RT (1<<8) +#define KINFO_VNODE (2<<8) +#define KINFO_FILE (3<<8) +#define KINFO_METER (4<<8) +#define KINFO_LOADAVG (5<<8) +#define KINFO_CLOCKRATE (6<<8) + +compat_43_getkerninfo(p, uap, retval) + struct proc *p; + register struct compat_43_getkerninfo_args /* { + syscallarg(int) op; + syscallarg(char *) where; + syscallarg(int *) size; + syscallarg(int) arg; + } */ *uap; + register_t *retval; +{ + int error, name[5]; + size_t size; + + if (SCARG(uap, size) && (error = copyin((caddr_t)SCARG(uap, size), + (caddr_t)&size, sizeof(size)))) + return (error); + + switch (SCARG(uap, op) & 0xff00) { + + case KINFO_RT: + name[0] = PF_ROUTE; + name[1] = 0; + name[2] = (SCARG(uap, op) & 0xff0000) >> 16; + name[3] = SCARG(uap, op) & 0xff; + name[4] = SCARG(uap, arg); + error = + net_sysctl(name, 5, SCARG(uap, where), &size, NULL, 0, p); + break; + + case KINFO_VNODE: + name[0] = KERN_VNODE; + error = + kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + break; + + case KINFO_PROC: + name[0] = KERN_PROC; + name[1] = SCARG(uap, op) & 0xff; + name[2] = SCARG(uap, arg); + error = + kern_sysctl(name, 3, SCARG(uap, where), &size, NULL, 0, p); + break; + + case KINFO_FILE: + name[0] = KERN_FILE; + error = + kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + break; + + case KINFO_METER: + name[0] = VM_METER; + error = + vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + break; + + case KINFO_LOADAVG: + name[0] = VM_LOADAVG; + error = + vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + break; + + case KINFO_CLOCKRATE: + name[0] = KERN_CLOCKRATE; + error = + kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + break; + + default: + return 
(EOPNOTSUPP); + } + if (error) + return (error); + *retval = size; + if (SCARG(uap, size)) + error = copyout((caddr_t)&size, (caddr_t)SCARG(uap, size), + sizeof(size)); + return (error); +} +#endif /* COMPAT_43 */ diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c new file mode 100644 index 000000000000..f4facf6f9fa0 --- /dev/null +++ b/sys/kern/kern_time.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_time.c 8.4 (Berkeley) 5/26/95 + */ + +#include <sys/param.h> +#include <sys/resourcevar.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/vnode.h> + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +#include <machine/cpu.h> + +/* + * Time of day and interval timer support. + * + * These routines provide the kernel entry points to get and set + * the time-of-day and per-process interval timers. Subroutines + * here provide support for adding and subtracting timeval structures + * and decrementing interval timers, optionally reloading the interval + * timers when they expire. 
+ */ + +/* ARGSUSED */ +int +gettimeofday(p, uap, retval) + struct proc *p; + register struct gettimeofday_args /* { + syscallarg(struct timeval *) tp; + syscallarg(struct timezone *) tzp; + } */ *uap; + register_t *retval; +{ + struct timeval atv; + int error = 0; + + if (SCARG(uap, tp)) { + microtime(&atv); + if (error = copyout((caddr_t)&atv, (caddr_t)SCARG(uap, tp), + sizeof (atv))) + return (error); + } + if (SCARG(uap, tzp)) + error = copyout((caddr_t)&tz, (caddr_t)SCARG(uap, tzp), + sizeof (tz)); + return (error); +} + +/* ARGSUSED */ +int +settimeofday(p, uap, retval) + struct proc *p; + struct settimeofday_args /* { + syscallarg(struct timeval *) tv; + syscallarg(struct timezone *) tzp; + } */ *uap; + register_t *retval; +{ + struct timeval atv, delta; + struct timezone atz; + int error, s; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + /* Verify all parameters before changing time. */ + if (SCARG(uap, tv) && (error = copyin((caddr_t)SCARG(uap, tv), + (caddr_t)&atv, sizeof(atv)))) + return (error); + if (SCARG(uap, tzp) && (error = copyin((caddr_t)SCARG(uap, tzp), + (caddr_t)&atz, sizeof(atz)))) + return (error); + if (SCARG(uap, tv)) { + /* + * If the system is secure, we do not allow the time to be + * set to an earlier value (it may be slowed using adjtime, + * but not set back). This feature prevent interlopers from + * setting arbitrary time stamps on files. + */ + if (securelevel > 0 && timercmp(&atv, &time, <)) + return (EPERM); + /* WHAT DO WE DO ABOUT PENDING REAL-TIME TIMEOUTS??? */ + s = splclock(); + /* nb. 
delta.tv_usec may be < 0, but this is OK here */ + delta.tv_sec = atv.tv_sec - time.tv_sec; + delta.tv_usec = atv.tv_usec - time.tv_usec; + time = atv; + (void) splsoftclock(); + timevaladd(&boottime, &delta); + timevalfix(&boottime); + timevaladd(&runtime, &delta); + timevalfix(&runtime); +# ifdef NFS + lease_updatetime(delta.tv_sec); +# endif + splx(s); + resettodr(); + } + if (SCARG(uap, tzp)) + tz = atz; + return (0); +} + +extern int tickadj; /* "standard" clock skew, us./tick */ +int tickdelta; /* current clock skew, us. per tick */ +long timedelta; /* unapplied time correction, us. */ +long bigadj = 1000000; /* use 10x skew above bigadj us. */ + +/* ARGSUSED */ +int +adjtime(p, uap, retval) + struct proc *p; + register struct adjtime_args /* { + syscallarg(struct timeval *) delta; + syscallarg(struct timeval *) olddelta; + } */ *uap; + register_t *retval; +{ + struct timeval atv; + register long ndelta, ntickdelta, odelta; + int s, error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + if (error = copyin((caddr_t)SCARG(uap, delta), (caddr_t)&atv, + sizeof(struct timeval))) + return (error); + + /* + * Compute the total correction and the rate at which to apply it. + * Round the adjustment down to a whole multiple of the per-tick + * delta, so that after some number of incremental changes in + * hardclock(), tickdelta will become zero, lest the correction + * overshoot and start taking us away from the desired final time. + */ + ndelta = atv.tv_sec * 1000000 + atv.tv_usec; + if (ndelta > bigadj) + ntickdelta = 10 * tickadj; + else + ntickdelta = tickadj; + if (ndelta % ntickdelta) + ndelta = ndelta / ntickdelta * ntickdelta; + + /* + * To make hardclock()'s job easier, make the per-tick delta negative + * if we want time to run slower; then hardclock can simply compute + * tick + tickdelta, and subtract tickdelta from timedelta. 
+ */ + if (ndelta < 0) + ntickdelta = -ntickdelta; + s = splclock(); + odelta = timedelta; + timedelta = ndelta; + tickdelta = ntickdelta; + splx(s); + + if (SCARG(uap, olddelta)) { + atv.tv_sec = odelta / 1000000; + atv.tv_usec = odelta % 1000000; + (void) copyout((caddr_t)&atv, (caddr_t)SCARG(uap, olddelta), + sizeof(struct timeval)); + } + return (0); +} + +/* + * Get value of an interval timer. The process virtual and + * profiling virtual time timers are kept in the p_stats area, since + * they can be swapped out. These are kept internally in the + * way they are specified externally: in time until they expire. + * + * The real time interval timer is kept in the process table slot + * for the process, and its value (it_value) is kept as an + * absolute time rather than as a delta, so that it is easy to keep + * periodic real-time signals from drifting. + * + * Virtual time timers are processed in the hardclock() routine of + * kern_clock.c. The real time timer is processed by a timeout + * routine, called from the softclock() routine. Since a callout + * may be delayed in real time due to interrupt processing in the system, + * it is possible for the real time timeout routine (realitexpire, given below), + * to be delayed in real time past when it is supposed to occur. It + * does not suffice, therefore, to reload the real timer .it_value from the + * real time timers .it_interval. Rather, we compute the next time in + * absolute time the timer should go off. + */ +/* ARGSUSED */ +int +getitimer(p, uap, retval) + struct proc *p; + register struct getitimer_args /* { + syscallarg(u_int) which; + syscallarg(struct itimerval *) itv; + } */ *uap; + register_t *retval; +{ + struct itimerval aitv; + int s; + + if (SCARG(uap, which) > ITIMER_PROF) + return (EINVAL); + s = splclock(); + if (SCARG(uap, which) == ITIMER_REAL) { + /* + * Convert from absolute to relative time in .it_value + * part of real time timer. 
If time for real time timer + * has passed return 0, else return difference between + * current time and time for the timer to go off. + */ + aitv = p->p_realtimer; + if (timerisset(&aitv.it_value)) + if (timercmp(&aitv.it_value, &time, <)) + timerclear(&aitv.it_value); + else + timevalsub(&aitv.it_value, + (struct timeval *)&time); + } else + aitv = p->p_stats->p_timer[SCARG(uap, which)]; + splx(s); + return (copyout((caddr_t)&aitv, (caddr_t)SCARG(uap, itv), + sizeof (struct itimerval))); +} + +/* ARGSUSED */ +int +setitimer(p, uap, retval) + struct proc *p; + register struct setitimer_args /* { + syscallarg(u_int) which; + syscallarg(struct itimerval *) itv; + syscallarg(struct itimerval *) oitv; + } */ *uap; + register_t *retval; +{ + struct itimerval aitv; + register struct itimerval *itvp; + int s, error; + + if (SCARG(uap, which) > ITIMER_PROF) + return (EINVAL); + itvp = SCARG(uap, itv); + if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, + sizeof(struct itimerval)))) + return (error); + if ((SCARG(uap, itv) = SCARG(uap, oitv)) && + (error = getitimer(p, uap, retval))) + return (error); + if (itvp == 0) + return (0); + if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval)) + return (EINVAL); + s = splclock(); + if (SCARG(uap, which) == ITIMER_REAL) { + untimeout(realitexpire, (caddr_t)p); + if (timerisset(&aitv.it_value)) { + timevaladd(&aitv.it_value, (struct timeval *)&time); + timeout(realitexpire, (caddr_t)p, hzto(&aitv.it_value)); + } + p->p_realtimer = aitv; + } else + p->p_stats->p_timer[SCARG(uap, which)] = aitv; + splx(s); + return (0); +} + +/* + * Real interval timer expired: + * send process whose timer expired an alarm signal. + * If time is not set up to reload, then just return. + * Else compute next time timer should go off which is > current time. + * This is where delay in processing this timeout causes multiple + * SIGALRM calls to be compressed into one. 
+ */ +void +realitexpire(arg) + void *arg; +{ + register struct proc *p; + int s; + + p = (struct proc *)arg; + psignal(p, SIGALRM); + if (!timerisset(&p->p_realtimer.it_interval)) { + timerclear(&p->p_realtimer.it_value); + return; + } + for (;;) { + s = splclock(); + timevaladd(&p->p_realtimer.it_value, + &p->p_realtimer.it_interval); + if (timercmp(&p->p_realtimer.it_value, &time, >)) { + timeout(realitexpire, (caddr_t)p, + hzto(&p->p_realtimer.it_value)); + splx(s); + return; + } + splx(s); + } +} + +/* + * Check that a proposed value to load into the .it_value or + * .it_interval part of an interval timer is acceptable, and + * fix it to have at least minimal value (i.e. if it is less + * than the resolution of the clock, round it up.) + */ +int +itimerfix(tv) + struct timeval *tv; +{ + + if (tv->tv_sec < 0 || tv->tv_sec > 100000000 || + tv->tv_usec < 0 || tv->tv_usec >= 1000000) + return (EINVAL); + if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) + tv->tv_usec = tick; + return (0); +} + +/* + * Decrement an interval timer by a specified number + * of microseconds, which must be less than a second, + * i.e. < 1000000. If the timer expires, then reload + * it. In this case, carry over (usec - old value) to + * reduce the value reloaded into the timer so that + * the timer does not drift. This routine assumes + * that it is called in a context where the timers + * on which it is operating cannot change in value. 
+ */ +int +itimerdecr(itp, usec) + register struct itimerval *itp; + int usec; +{ + + if (itp->it_value.tv_usec < usec) { + if (itp->it_value.tv_sec == 0) { + /* expired, and already in next interval */ + usec -= itp->it_value.tv_usec; + goto expire; + } + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + itp->it_value.tv_usec -= usec; + usec = 0; + if (timerisset(&itp->it_value)) + return (1); + /* expired, exactly at end of interval */ +expire: + if (timerisset(&itp->it_interval)) { + itp->it_value = itp->it_interval; + itp->it_value.tv_usec -= usec; + if (itp->it_value.tv_usec < 0) { + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + } else + itp->it_value.tv_usec = 0; /* sec is already 0 */ + return (0); +} + +/* + * Add and subtract routines for timevals. + * N.B.: subtract routine doesn't deal with + * results which are before the beginning, + * it just gets very confused in this case. + * Caveat emptor. + */ +timevaladd(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec += t2->tv_sec; + t1->tv_usec += t2->tv_usec; + timevalfix(t1); +} + +timevalsub(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +timevalfix(t1) + struct timeval *t1; +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c new file mode 100644 index 000000000000..caa1cdd10c00 --- /dev/null +++ b/sys/kern/kern_xxx.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/reboot.h> +#include <vm/vm.h> +#include <sys/sysctl.h> + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +/* ARGSUSED */ +int +reboot(p, uap, retval) + struct proc *p; + struct reboot_args /* { + syscallarg(int) opt; + } */ *uap; + register_t *retval; +{ + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + boot(SCARG(uap, opt)); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +/* ARGSUSED */ +int +compat_43_gethostname(p, uap, retval) + struct proc *p; + struct compat_43_gethostname_args /* { + syscallarg(char *) hostname; + syscallarg(u_int) len; + } */ *uap; + register_t *retval; +{ + int name; + + name = KERN_HOSTNAME; + return (kern_sysctl(&name, 1, SCARG(uap, hostname), &SCARG(uap, len), + 0, 0)); +} + +/* ARGSUSED */ +int +compat_43_sethostname(p, uap, retval) + struct proc *p; + register struct compat_43_sethostname_args /* { + syscallarg(char *) hostname; + syscallarg(u_int) len; + } */ *uap; + register_t *retval; +{ + int name; + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + name = KERN_HOSTNAME; + return (kern_sysctl(&name, 1, 0, 0, SCARG(uap, hostname), + SCARG(uap, len))); +} + +/* ARGSUSED */ +int +compat_43_gethostid(p, uap, retval) + struct proc *p; + void *uap; + register_t *retval; +{ + + *(int32_t *)retval = hostid; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifdef COMPAT_43 +/* ARGSUSED */ +int +compat_43_sethostid(p, uap, retval) + struct proc *p; + struct compat_43_sethostid_args /* { + syscallarg(int32_t) hostid; + } */ *uap; + register_t *retval; +{ + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + hostid = SCARG(uap, hostid); + return (0); +} + +int +compat_43_quota(p, uap, retval) + struct proc *p; + void *uap; + register_t *retval; +{ + + 
return (ENOSYS); +} +#endif /* COMPAT_43 */ diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh new file mode 100644 index 000000000000..4e2c28c44e02 --- /dev/null +++ b/sys/kern/makesyscalls.sh @@ -0,0 +1,365 @@ +#! /bin/sh - +# +# @(#)makesyscalls.sh 8.2 (Berkeley) 2/14/95 + +set -e + +case $# in + 2) ;; + *) echo "Usage: $0 config-file input-file" 1>&2 + exit 1 + ;; +esac + +# source the config file. +. $1 + +# the config file sets the following variables: +# sysnames the syscall names file +# sysnumhdr the syscall numbers file +# syssw the syscall switch file +# sysarghdr the syscall argument struct definitions +# compatopts those syscall types that are for 'compat' syscalls +# switchname the name for the 'struct sysent' we define +# namesname the name for the 'char *[]' we define +# constprefix the prefix for the system call constants +# +# NOTE THAT THIS makesyscalls.sh DOES NOT SUPPORT 'LIBCOMPAT'. + +# tmp files: +sysdcl="sysent.dcl" +syscompat_pref="sysent." +sysent="sysent.switch" + +syscompat_files="" +for file in $compatopts; do + syscompat_files="$syscompat_files $syscompat_pref$file" +done + +trap "rm $sysdcl $syscompat_files $sysent" 0 + +# Awk program (must support nawk extensions) +# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere. +awk=${AWK:-awk} + +# Does this awk have a "toupper" function? (i.e. is it GNU awk) +isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null` + +# If this awk does not define "toupper" then define our own. +if [ "$isgawk" = TRUE ] ; then + # GNU awk provides it. + toupper= +else + # Provide our own toupper() + toupper=' +function toupper(str) { + _toupper_cmd = "echo "str" |tr a-z A-Z" + _toupper_cmd | getline _toupper_str; + close(_toupper_cmd); + return _toupper_str; +}' +fi + +# before handing it off to awk, make a few adjustments: +# (1) insert spaces around {, }, (, ), *, and commas. 
+# (2) get rid of any and all dollar signs (so that rcs id use safe) +# +# The awk script will deal with blank lines and lines that +# start with the comment character (';'). + +sed -e ' +s/\$//g +:join + /\\$/{a\ + + N + s/\\\n// + b join + } +2,${ + /^#/!s/\([{}()*,]\)/ \1 /g +} +' < $2 | $awk " +$toupper +BEGIN { + sysnames = \"$sysnames\" + sysnumhdr = \"$sysnumhdr\" + sysarghdr = \"$sysarghdr\" + switchname = \"$switchname\" + namesname = \"$namesname\" + constprefix = \"$constprefix\" + + sysdcl = \"$sysdcl\" + syscompat_pref = \"$syscompat_pref\" + sysent = \"$sysent\" + infile = \"$2\" + + compatopts = \"$compatopts\" + "' + + printf "/*\n * System call switch table.\n *\n" > sysdcl + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysdcl + + ncompat = split(compatopts,compat) + for (i = 1; i <= ncompat; i++) { + compat_upper[i] = toupper(compat[i]) + compat_file[i] = sprintf("%s%s", syscompat_pref, compat[i]) + + printf "\n#ifdef %s\n", compat_upper[i] > compat_file[i] + printf "#define %s(func) __CONCAT(%s_,func)\n\n", \ + compat[i], compat[i] > compat_file[i] + } + + printf "/*\n * System call names.\n *\n" > sysnames + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + + printf "/*\n * System call numbers.\n *\n" > sysnumhdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnumhdr + + printf "/*\n * System call argument lists.\n *\n" > sysarghdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarghdr +} +NR == 1 { + printf " * created from%s\n */\n\n", $0 > sysdcl + + printf "#define\ts(type)\tsizeof(type)\n\n" > sysent + printf "struct sysent %s[] = {\n",switchname > sysent + + printf " * created from%s\n */\n\n", $0 > sysnames + printf "char *%s[] = {\n",namesname > sysnames + + printf " * created from%s\n */\n\n", $0 > sysnumhdr + + printf " * created from%s\n */\n\n", $0 > sysarghdr + printf "#define\tsyscallarg(x)\tunion { x datum; register_t pad; }\n" \ 
+	    > sysarghdr
+	next
+}
+NF == 0 || $1 ~ /^;/ {
+	next
+}
+$1 ~ /^#[ 	]*include/ {
+	print > sysdcl
+	next
+}
+$1 ~ /^#[ 	]*if/ {
+	print > sysent
+	print > sysdcl
+	for (i = 1; i <= ncompat; i++)
+		print > compat_file[i]
+	print > sysnames
+	savesyscall = syscall
+	next
+}
+$1 ~ /^#[ 	]*else/ {
+	print > sysent
+	print > sysdcl
+	for (i = 1; i <= ncompat; i++)
+		print > compat_file[i]
+	print > sysnames
+	syscall = savesyscall
+	next
+}
+$1 ~ /^#/ {
+	print > sysent
+	print > sysdcl
+	for (i = 1; i <= ncompat; i++)
+		print > compat_file[i]
+	print > sysnames
+	next
+}
+syscall != $1 {
+	printf "%s: line %d: syscall number out of sync at %d\n", \
+	   infile, NR, syscall
+	printf "line is:\n"
+	print
+	exit 1
+}
+# Report a parse error: "was" is the token seen, "wanted" the token expected.
+function parserr(was, wanted) {
+	printf "%s: line %d: unexpected %s (expected %s)\n", \
+	    infile, NR, was, wanted
+	exit 1
+}
+# Parse one "{ type name(args); } [alias]" line from syscalls.master into
+# funcname, funcalias, argc, argtype[], argname[].
+function parseline() {
+	f=3			# toss number and type
+	if ($NF != "}") {
+		funcalias=$NF
+		end=NF-1
+	} else {
+		funcalias=""
+		end=NF
+	}
+	if ($f != "{")
+		parserr($f, "{")
+	f++
+	if ($end != "}")
+		parserr($end, "}")
+	end--
+	if ($end != ";")
+		parserr($end, ";")
+	end--
+	if ($end != ")")
+		parserr($end, ")")
+	end--
+
+	f++			# toss return type
+
+	funcname=$f
+	if (funcalias == "")
+		funcalias=funcname
+	f++
+
+	if ($f != "(")
+		parserr($f, "(")	# was misreported as ")"
+	f++
+
+	argc = 0
+	if (f == end) {
+		if ($f != "void")
+			parserr($f, "argument definition")
+		return
+	}
+
+	while (f <= end) {
+		argc++
+		argtype[argc]=""
+		oldf=""
+		while (f < end && $(f+1) != ",") {
+			if (argtype[argc] != "" && oldf != "*")
+				argtype[argc] = argtype[argc]" ";
+			argtype[argc] = argtype[argc]$f;
+			oldf = $f;
+			f++
+		}
+		if (argtype[argc] == "")
+			parserr($f, "argument definition")
+		argname[argc]=$f;
+		f += 2;			# skip name, and any comma
+	}
+}
+# Emit one syscall's entries into the switch table, names table, number
+# header and argument-struct header. "nodefs" modifies which outputs are
+# produced; "compatwrap" wraps compat syscalls in their COMPAT macro.
+function putent(nodefs, declfile, compatwrap) {
+	# output syscall declaration for switch table
+	if (compatwrap == "")
+		printf("int\t%s();\n", funcname) > declfile
+	else
+		printf("int\t%s(%s)();\n", compatwrap, funcname) > 
declfile + + # output syscall switch entry +# printf("\t{ { %d", argc) > sysent +# for (i = 1; i <= argc; i++) { +# if (i == 5) # wrap the line +# printf(",\n\t ") > sysent +# else +# printf(", ") > sysent +# printf("s(%s)", argtypenospc[i]) > sysent +# } + printf("\t{ %d, ", argc) > sysent + if (argc == 0) + printf("0") > sysent + else if (compatwrap == "") + printf("s(struct %s_args)", funcname) > sysent + else + printf("s(struct %s_%s_args)", compatwrap, funcname) > sysent + if (compatwrap == "") + wfn = sprintf("%s", funcname); + else + wfn = sprintf("%s(%s)", compatwrap, funcname); + printf(",\n\t %s },", wfn) > sysent + for (i = 0; i < (33 - length(wfn)) / 8; i++) + printf("\t") > sysent + if (compatwrap == "") + printf("/* %d = %s */\n", syscall, funcalias) > sysent + else + printf("/* %d = %s %s */\n", syscall, compatwrap, + funcalias) > sysent + + # output syscall name for names table + if (compatwrap == "") + printf("\t\"%s\",\t\t\t/* %d = %s */\n", funcalias, syscall, + funcalias) > sysnames + else + printf("\t\"%s_%s\",\t/* %d = %s %s */\n", compatwrap, + funcalias, syscall, compatwrap, funcalias) > sysnames + + # output syscall number of header, if appropriate + if (nodefs == "" || nodefs == "NOARGS") + printf("#define\t%s%s\t%d\n", constprefix, funcalias, + syscall) > sysnumhdr + else if (nodefs != "NODEF") + printf("\t\t\t\t/* %d is %s %s */\n", syscall, + compatwrap, funcalias) > sysnumhdr + + # output syscall argument structure, if it has arguments + if (argc != 0 && nodefs != "NOARGS") { + if (compatwrap == "") + printf("\nstruct %s_args {\n", funcname) > sysarghdr + else + printf("\nstruct %s_%s_args {\n", compatwrap, + funcname) > sysarghdr + for (i = 1; i <= argc; i++) + printf("\tsyscallarg(%s) %s;\n", argtype[i], + argname[i]) > sysarghdr + printf("};\n") > sysarghdr + } +} +$2 == "STD" { + parseline() + putent("", sysdcl, "") + syscall++ + next +} +$2 == "NODEF" || $2 == "NOARGS" { + parseline() + putent($2, sysdcl, "") + syscall++ + next +} 
+$2 == "OBSOL" || $2 == "UNIMPL" { + if ($2 == "OBSOL") + comment="obsolete" + else + comment="unimplemented" + for (i = 3; i <= NF; i++) + comment=comment " " $i + + printf("\t{ 0, 0,\n\t nosys },\t\t\t\t/* %d = %s */\n", \ + syscall, comment) > sysent + printf("\t\"#%d (%s)\",\t\t/* %d = %s */\n", \ + syscall, comment, syscall, comment) > sysnames + if ($2 != "UNIMPL") + printf("\t\t\t\t/* %d is %s */\n", syscall, comment) > sysnumhdr + syscall++ + next +} +{ + for (i = 1; i <= ncompat; i++) { + if ($2 == compat_upper[i]) { + parseline(); + putent("COMMENT", compat_file[i], compat[i]) + syscall++ + next + } + } + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 +} +END { + printf "\n#undef\tsyscallarg\n" > sysarghdr + + for (i = 1; i <= ncompat; i++) { + printf("\n#else /* %s */\n", compat_upper[i]) > compat_file[i] + printf("#define %s(func) nosys\n", compat[i]) > \ + compat_file[i] + printf("#endif /* %s */\n\n", compat_upper[i]) > compat_file[i] + } + + printf("};\n\n") > sysent + printf("int\tn%s= sizeof(%s) / sizeof(%s[0]);\n", switchname, + switchname, switchname) > sysent + + printf("};\n") > sysnames +} ' + +cat $sysdcl $syscompat_files $sysent > $syssw + +#chmod 444 $sysnames $syshdr $syssw diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c new file mode 100644 index 000000000000..728133978adb --- /dev/null +++ b/sys/kern/subr_autoconf.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Lawrence Berkeley Laboratories. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94 + * + * from: $Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp $ (LBL) + */ + +#include <sys/param.h> +#include <sys/device.h> +#include <sys/malloc.h> +#include <libkern/libkern.h> + +/* + * Autoconfiguration subroutines. + */ + +/* + * ioconf.c exports exactly two names: cfdata and cfroots. All system + * devices and drivers are found via these tables. + */ +extern struct cfdata cfdata[]; +extern short cfroots[]; + +#define ROOT ((struct device *)NULL) + +struct matchinfo { + cfmatch_t fn; + struct device *parent; + void *aux; + struct cfdata *match; + int pri; +}; + +/* + * Apply the matching function and choose the best. This is used + * a few times and we want to keep the code small. + */ +static void +mapply(m, cf) + register struct matchinfo *m; + register struct cfdata *cf; +{ + register int pri; + + if (m->fn != NULL) + pri = (*m->fn)(m->parent, cf, m->aux); + else + pri = (*cf->cf_driver->cd_match)(m->parent, cf, m->aux); + if (pri > m->pri) { + m->match = cf; + m->pri = pri; + } +} + +/* + * Iterate over all potential children of some device, calling the given + * function (default being the child's match function) for each one. + * Nonzero returns are matches; the highest value returned is considered + * the best match. Return the `found child' if we got a match, or NULL + * otherwise. The `aux' pointer is simply passed on through. + * + * Note that this function is designed so that it can be used to apply + * an arbitrary function to all potential children (its return value + * can be ignored). 
+ */ +struct cfdata * +config_search(fn, parent, aux) + cfmatch_t fn; + register struct device *parent; + void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = parent; + m.aux = aux; + m.match = NULL; + m.pri = 0; + for (cf = cfdata; cf->cf_driver; cf++) { + /* + * Skip cf if no longer eligible, otherwise scan through + * parents for one matching `parent', and try match function. + */ + if (cf->cf_fstate == FSTATE_FOUND) + continue; + for (p = cf->cf_parents; *p >= 0; p++) + if (parent->dv_cfdata == &cfdata[*p]) + mapply(&m, cf); + } + return (m.match); +} + +/* + * Find the given root device. + * This is much like config_search, but there is no parent. + */ +struct cfdata * +config_rootsearch(fn, rootname, aux) + register cfmatch_t fn; + register char *rootname; + register void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = ROOT; + m.aux = aux; + m.match = NULL; + m.pri = 0; + /* + * Look at root entries for matching name. We do not bother + * with found-state here since only one root should ever be + * searched (and it must be done first). + */ + for (p = cfroots; *p >= 0; p++) { + cf = &cfdata[*p]; + if (strcmp(cf->cf_driver->cd_name, rootname) == 0) + mapply(&m, cf); + } + return (m.match); +} + +static char *msgs[3] = { "", " not configured\n", " unsupported\n" }; + +/* + * The given `aux' argument describes a device that has been found + * on the given parent, but not necessarily configured. Locate the + * configuration data for that device (using the cd_match configuration + * driver function) and attach it, and return true. If the device was + * not configured, call the given `print' function and return 0. 
+ */ +int +config_found(parent, aux, print) + struct device *parent; + void *aux; + cfprint_t print; +{ + struct cfdata *cf; + + if ((cf = config_search((cfmatch_t)NULL, parent, aux)) != NULL) { + config_attach(parent, cf, aux, print); + return (1); + } + printf(msgs[(*print)(aux, parent->dv_xname)]); + return (0); +} + +/* + * As above, but for root devices. + */ +int +config_rootfound(rootname, aux) + char *rootname; + void *aux; +{ + struct cfdata *cf; + + if ((cf = config_rootsearch((cfmatch_t)NULL, rootname, aux)) != NULL) { + config_attach(ROOT, cf, aux, (cfprint_t)NULL); + return (1); + } + printf("root device %s not configured\n", rootname); + return (0); +} + +/* just like sprintf(buf, "%d") except that it works from the end */ +static char * +number(ep, n) + register char *ep; + register int n; +{ + + *--ep = 0; + while (n >= 10) { + *--ep = (n % 10) + '0'; + n /= 10; + } + *--ep = n + '0'; + return (ep); +} + +/* + * Attach a found device. Allocates memory for device variables. + */ +void +config_attach(parent, cf, aux, print) + register struct device *parent; + register struct cfdata *cf; + register void *aux; + cfprint_t print; +{ + register struct device *dev; + register struct cfdriver *cd; + register size_t lname, lunit; + register char *xunit; + int myunit; + char num[10]; + static struct device **nextp = &alldevs; + + cd = cf->cf_driver; + if (cd->cd_devsize < sizeof(struct device)) + panic("config_attach"); + myunit = cf->cf_unit; + if (cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + else + cf->cf_unit++; + + /* compute length of name and decimal expansion of unit number */ + lname = strlen(cd->cd_name); + xunit = number(&num[sizeof num], myunit); + lunit = &num[sizeof num] - xunit; + if (lname + lunit >= sizeof(dev->dv_xname)) + panic("config_attach: device name too long"); + + /* get memory for all device vars */ + dev = (struct device *)malloc(cd->cd_devsize, M_DEVBUF, M_WAITOK); + /* XXX cannot wait! 
*/ + bzero(dev, cd->cd_devsize); + *nextp = dev; /* link up */ + nextp = &dev->dv_next; + dev->dv_class = cd->cd_class; + dev->dv_cfdata = cf; + dev->dv_unit = myunit; + bcopy(cd->cd_name, dev->dv_xname, lname); + bcopy(xunit, dev->dv_xname + lname, lunit); + dev->dv_parent = parent; + if (parent == ROOT) + printf("%s (root)", dev->dv_xname); + else { + printf("%s at %s", dev->dv_xname, parent->dv_xname); + (void) (*print)(aux, (char *)0); + } + + /* put this device in the devices array */ + if (dev->dv_unit >= cd->cd_ndevs) { + /* + * Need to expand the array. + */ + int old = cd->cd_ndevs, oldbytes, new, newbytes; + void **nsp; + + if (old == 0) { + new = max(MINALLOCSIZE / sizeof(void *), + dev->dv_unit + 1); + newbytes = new * sizeof(void *); + nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ + bzero(nsp, newbytes); + } else { + new = cd->cd_ndevs; + do { + new *= 2; + } while (new <= dev->dv_unit); + oldbytes = old * sizeof(void *); + newbytes = new * sizeof(void *); + nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ + bcopy(cd->cd_devs, nsp, oldbytes); + bzero(&nsp[old], newbytes - oldbytes); + free(cd->cd_devs, M_DEVBUF); + } + cd->cd_ndevs = new; + cd->cd_devs = nsp; + } + if (cd->cd_devs[dev->dv_unit]) + panic("config_attach: duplicate %s", dev->dv_xname); + cd->cd_devs[dev->dv_unit] = dev; + + /* + * Before attaching, clobber any unfound devices that are + * otherwise identical. + */ + for (cf = cfdata; cf->cf_driver; cf++) + if (cf->cf_driver == cd && cf->cf_unit == dev->dv_unit && + cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + (*cd->cd_attach)(parent, dev, aux); +} + +/* + * Attach an event. These must come from initially-zero space (see + * commented-out assignments below), but that occurs naturally for + * device instance variables. 
+ */ +void +evcnt_attach(dev, name, ev) + struct device *dev; + const char *name; + struct evcnt *ev; +{ + static struct evcnt **nextp = &allevents; + +#ifdef DIAGNOSTIC + if (strlen(name) >= sizeof(ev->ev_name)) + panic("evcnt_attach"); +#endif + /* ev->ev_next = NULL; */ + ev->ev_dev = dev; + /* ev->ev_count = 0; */ + strcpy(ev->ev_name, name); + *nextp = ev; + nextp = &ev->ev_next; +} diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c new file mode 100644 index 000000000000..792a1cec1b20 --- /dev/null +++ b/sys/kern/subr_log.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_log.c 8.3 (Berkeley) 2/14/95 + */ + +/* + * Error log buffer for kernel printf's. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/ioctl.h> +#include <sys/msgbuf.h> +#include <sys/file.h> + +#define LOG_RDPRI (PZERO + 1) + +#define LOG_ASYNC 0x04 +#define LOG_RDWAIT 0x08 + +struct logsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* process waiting on select call */ + int sc_pgid; /* process/group for async I/O */ +} logsoftc; + +int log_open; /* also used in log() */ + +/*ARGSUSED*/ +int +logopen(dev, flags, mode, p) + dev_t dev; + int flags, mode; + struct proc *p; +{ + register struct msgbuf *mbp = msgbufp; + + if (log_open) + return (EBUSY); + log_open = 1; + logsoftc.sc_pgid = p->p_pid; /* signal process only */ + /* + * Potential race here with putchar() but since putchar should be + * called by autoconf, msg_magic should be initialized by the time + * we get here. 
+ */ + if (mbp->msg_magic != MSG_MAGIC) { + register int i; + + mbp->msg_magic = MSG_MAGIC; + mbp->msg_bufx = mbp->msg_bufr = 0; + for (i=0; i < MSG_BSIZE; i++) + mbp->msg_bufc[i] = 0; + } + return (0); +} + +/*ARGSUSED*/ +int +logclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + + log_open = 0; + logsoftc.sc_state = 0; + return (0); +} + +/*ARGSUSED*/ +int +logread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct msgbuf *mbp = msgbufp; + register long l; + register int s; + int error = 0; + + s = splhigh(); + while (mbp->msg_bufr == mbp->msg_bufx) { + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + logsoftc.sc_state |= LOG_RDWAIT; + if (error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0)) { + splx(s); + return (error); + } + } + splx(s); + logsoftc.sc_state &= ~LOG_RDWAIT; + + while (uio->uio_resid > 0) { + l = mbp->msg_bufx - mbp->msg_bufr; + if (l < 0) + l = MSG_BSIZE - mbp->msg_bufr; + l = min(l, uio->uio_resid); + if (l == 0) + break; + error = uiomove((caddr_t)&mbp->msg_bufc[mbp->msg_bufr], + (int)l, uio); + if (error) + break; + mbp->msg_bufr += l; + if (mbp->msg_bufr < 0 || mbp->msg_bufr >= MSG_BSIZE) + mbp->msg_bufr = 0; + } + return (error); +} + +/*ARGSUSED*/ +int +logselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + int s = splhigh(); + + switch (rw) { + + case FREAD: + if (msgbufp->msg_bufr != msgbufp->msg_bufx) { + splx(s); + return (1); + } + selrecord(p, &logsoftc.sc_selp); + break; + } + splx(s); + return (0); +} + +void +logwakeup() +{ + struct proc *p; + + if (!log_open) + return; + selwakeup(&logsoftc.sc_selp); + if (logsoftc.sc_state & LOG_ASYNC) { + if (logsoftc.sc_pgid < 0) + gsignal(-logsoftc.sc_pgid, SIGIO); + else if (p = pfind(logsoftc.sc_pgid)) + psignal(p, SIGIO); + } + if (logsoftc.sc_state & LOG_RDWAIT) { + wakeup((caddr_t)msgbufp); + logsoftc.sc_state &= ~LOG_RDWAIT; + } +} + +/*ARGSUSED*/ +int +logioctl(dev, com, data, flag, p) + 
dev_t dev; + u_long com; + caddr_t data; + int flag; + struct proc *p; +{ + long l; + int s; + + switch (com) { + + /* return number of characters immediately available */ + case FIONREAD: + s = splhigh(); + l = msgbufp->msg_bufx - msgbufp->msg_bufr; + splx(s); + if (l < 0) + l += MSG_BSIZE; + *(int *)data = l; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *)data) + logsoftc.sc_state |= LOG_ASYNC; + else + logsoftc.sc_state &= ~LOG_ASYNC; + break; + + case TIOCSPGRP: + logsoftc.sc_pgid = *(int *)data; + break; + + case TIOCGPGRP: + *(int *)data = logsoftc.sc_pgid; + break; + + default: + return (-1); + } + return (0); +} diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c new file mode 100644 index 000000000000..8a9a44edaf39 --- /dev/null +++ b/sys/kern/subr_prf.c @@ -0,0 +1,606 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. 
+ * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_prf.c 8.4 (Berkeley) 5/4/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/reboot.h> +#include <sys/msgbuf.h> +#include <sys/proc.h> +#include <sys/ioctl.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/tty.h> +#include <sys/tprintf.h> +#include <sys/syslog.h> +#include <sys/malloc.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. 
+ */ +#include <machine/stdarg.h> + +#ifdef KADB +#include <machine/kdbparam.h> +#endif + +#define TOCONS 0x01 +#define TOTTY 0x02 +#define TOLOG 0x04 + +struct tty *constty; /* pointer to console "window" tty */ + +extern cnputc(); /* standard console putc */ +int (*v_putc)() = cnputc; /* routine to putc on virtual console */ + +void logpri __P((int level)); +static void putchar __P((int ch, int flags, struct tty *tp)); +static char *ksprintn __P((u_long num, int base, int *len)); +void kprintf __P((const char *fmt, int flags, struct tty *tp, va_list ap)); + +int consintr = 1; /* Ok to handle console interrupts? */ + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. If we are called twice, then we avoid trying to sync + * the disks as this often leads to recursive panics. + */ +#ifdef __GNUC__ +volatile void boot(int flags); /* boot() does not return */ +volatile /* panic() does not return */ +#endif +void +#ifdef __STDC__ +panic(const char *fmt, ...) +#else +panic(fmt, va_alist) + char *fmt; +#endif +{ + int bootopt; + va_list ap; + + bootopt = RB_AUTOBOOT | RB_DUMP; + if (panicstr) + bootopt |= RB_NOSYNC; + else + panicstr = fmt; + + va_start(ap, fmt); + printf("panic: %r\n", fmt, ap); + va_end(ap); + +#ifdef KGDB + kgdb_panic(); +#endif +#ifdef KADB + if (boothowto & RB_KDB) + kdbpanic(); +#endif + boot(bootopt); +} + +/* + * Warn that a system table is full. + */ +void +tablefull(tab) + const char *tab; +{ + + log(LOG_ERR, "%s: table is full\n", tab); +} + +/* + * Uprintf prints to the controlling terminal for the current process. + * It may block if the tty queue is overfull. No message is printed if + * the queue does not clear in a reasonable time. + */ +void +#ifdef __STDC__ +uprintf(const char *fmt, ...) 
#else
uprintf(fmt, va_alist)
	char *fmt;
#endif
{
	register struct proc *p = curproc;
	va_list ap;

	/* Print only if the process has a controlling tty attached. */
	if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
		va_start(ap, fmt);
		kprintf(fmt, TOTTY, p->p_session->s_ttyp, ap);
		va_end(ap);
	}
}

/*
 * Return a session handle (with an extra reference held) for later use
 * with tprintf(), or NULL if the process has no controlling terminal.
 * The caller must release the handle with tprintf_close().
 */
tpr_t
tprintf_open(p)
	register struct proc *p;
{

	if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
		SESSHOLD(p->p_session);
		return ((tpr_t) p->p_session);
	}
	return ((tpr_t) NULL);
}

/*
 * Release a session handle obtained from tprintf_open().
 * A NULL handle is ignored.
 */
void
tprintf_close(sess)
	tpr_t sess;
{

	if (sess)
		SESSRELE((struct session *) sess);
}

/*
 * tprintf prints on the controlling terminal associated
 * with the given session.
 */
void
#ifdef __STDC__
tprintf(tpr_t tpr, const char *fmt, ...)
#else
tprintf(tpr, fmt, va_alist)
	tpr_t tpr;
	char *fmt;
#endif
{
	register struct session *sess = (struct session *)tpr;
	struct tty *tp = NULL;
	int flags = TOLOG;
	va_list ap;

	/* Always log; also write to the tty if it has output room. */
	logpri(LOG_INFO);
	if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) {
		flags |= TOTTY;
		tp = sess->s_ttyp;
	}
	va_start(ap, fmt);
	kprintf(fmt, flags, tp, ap);
	va_end(ap);
	logwakeup();
}

/*
 * Ttyprintf displays a message on a tty; it should be used only by
 * the tty driver, or anything that knows the underlying tty will not
 * be revoke(2)'d away.  Other callers should use tprintf.
 */
void
#ifdef __STDC__
ttyprintf(struct tty *tp, const char *fmt, ...)
#else
ttyprintf(tp, fmt, va_alist)
	struct tty *tp;
	char *fmt;
#endif
{
	va_list ap;

	va_start(ap, fmt);
	kprintf(fmt, TOTTY, tp, ap);
	va_end(ap);
}

extern	int log_open;

/*
 * Log writes to the log buffer, and guarantees not to sleep (so can be
 * called by interrupt routines).  If there is no process reading the
 * log yet, it writes to the console also.
 */
void
#ifdef __STDC__
log(int level, const char *fmt, ...)
#else
log(level, fmt, va_alist)
	int level;
	char *fmt;
#endif
{
	register int s;
	va_list ap;

	/* Block interrupts so the <level> prefix and message stay intact. */
	s = splhigh();
	logpri(level);
	va_start(ap, fmt);
	kprintf(fmt, TOLOG, NULL, ap);
	splx(s);
	va_end(ap);
	/* No syslogd reading yet: echo to the console as well. */
	if (!log_open) {
		va_start(ap, fmt);
		kprintf(fmt, TOCONS, NULL, ap);
		va_end(ap);
	}
	logwakeup();
}

/*
 * Write a syslog priority prefix of the form "<level>" to the
 * kernel message buffer.
 */
void
logpri(level)
	int level;
{
	register int ch;
	register char *p;

	putchar('<', TOLOG, NULL);
	/* ksprintn() returns digits in reverse order; walk back down. */
	for (p = ksprintn((u_long)level, 10, NULL); ch = *p--;)
		putchar(ch, TOLOG, NULL);
	putchar('>', TOLOG, NULL);
}

/*
 * Append to the last log message; like log() but without the
 * "<level>" priority prefix.
 */
void
#ifdef __STDC__
addlog(const char *fmt, ...)
#else
addlog(fmt, va_alist)
	char *fmt;
#endif
{
	register int s;
	va_list ap;

	s = splhigh();
	va_start(ap, fmt);
	kprintf(fmt, TOLOG, NULL, ap);
	splx(s);
	va_end(ap);
	if (!log_open) {
		va_start(ap, fmt);
		kprintf(fmt, TOCONS, NULL, ap);
		va_end(ap);
	}
	logwakeup();
}

/*
 * Kernel printf: writes to both the console and the message buffer.
 * Console interrupt handling is disabled (via consintr) for the
 * duration so output is not interleaved.
 */
void
#ifdef __STDC__
printf(const char *fmt, ...)
#else
printf(fmt, va_alist)
	char *fmt;
#endif
{
	va_list ap;
	register int savintr;

	savintr = consintr;		/* disable interrupts */
	consintr = 0;
	va_start(ap, fmt);
	kprintf(fmt, TOCONS | TOLOG, NULL, ap);
	va_end(ap);
	if (!panicstr)
		logwakeup();
	consintr = savintr;		/* reenable interrupts */
}

/*
 * Scaled down version of printf(3).
 *
 * Two additional formats:
 *
 * The format %b is supported to decode error registers.
 * Its usage is:
 *
 *	printf("reg=%b\n", regval, "<base><arg>*");
 *
 * where <base> is the output base expressed as a control character, e.g.
 * \10 gives octal; \20 gives hex.  Each arg is a sequence of characters,
 * the first of which gives the bit number to be inspected (origin 1), and
 * the next characters (up to a control character, i.e. a character <= 32),
 * give the name of the register.
 Thus:
 *
 *	kprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
 *
 * would produce output:
 *
 *	reg=3<BITTWO,BITONE>
 *
 * The format %r passes an additional format string and argument list
 * recursively.  Its usage is:
 *
 * fn(char *fmt, ...)
 * {
 *	va_list ap;
 *	va_start(ap, fmt);
 *	printf("prefix: %r: suffix\n", fmt, ap);
 *	va_end(ap);
 * }
 *
 * Space or zero padding and a field width are supported for the numeric
 * formats only.
 */
void
kprintf(fmt, flags, tp, ap)
	register const char *fmt;
	int flags;
	struct tty *tp;
	va_list ap;
{
	register char *p, *q;
	register int ch, n;
	u_long ul;
	int base, lflag, tmp, width;
	char padc;

	for (;;) {
		padc = ' ';
		width = 0;
		/* Copy literal text until the next '%' or end of format. */
		while ((ch = *(u_char *)fmt++) != '%') {
			if (ch == '\0')
				return;
			putchar(ch, flags, tp);
		}
		lflag = 0;
reswitch:	switch (ch = *(u_char *)fmt++) {
		case '0':
			padc = '0';
			goto reswitch;
		case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
			/* Accumulate a decimal field width. */
			for (width = 0;; ++fmt) {
				width = width * 10 + ch - '0';
				ch = *fmt;
				if (ch < '0' || ch > '9')
					break;
			}
			goto reswitch;
		case 'l':
			lflag = 1;
			goto reswitch;
		case 'b':
			/* Bit-decode: value, then "<base><bit><name>..." */
			ul = va_arg(ap, int);
			p = va_arg(ap, char *);
			for (q = ksprintn(ul, *p++, NULL); ch = *q--;)
				putchar(ch, flags, tp);

			if (!ul)
				break;

			for (tmp = 0; n = *p++;) {
				if (ul & (1 << (n - 1))) {
					/* '<' before first name, ',' after */
					putchar(tmp ? ',' : '<', flags, tp);
					for (; (n = *p) > ' '; ++p)
						putchar(n, flags, tp);
					tmp = 1;
				} else
					/* skip the name of an unset bit */
					for (; *p > ' '; ++p)
						continue;
			}
			if (tmp)
				putchar('>', flags, tp);
			break;
		case 'c':
			putchar(va_arg(ap, int), flags, tp);
			break;
		case 'r':
			/* Recursive format: nested (fmt, va_list) pair. */
			p = va_arg(ap, char *);
			kprintf(p, flags, tp, va_arg(ap, va_list));
			break;
		case 's':
			p = va_arg(ap, char *);
			while (ch = *p++)
				putchar(ch, flags, tp);
			break;
		case 'd':
			ul = lflag ? va_arg(ap, long) : va_arg(ap, int);
			if ((long)ul < 0) {
				putchar('-', flags, tp);
				ul = -(long)ul;
			}
			base = 10;
			goto number;
		case 'o':
			ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
			base = 8;
			goto number;
		case 'u':
			ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
			base = 10;
			goto number;
		case 'x':
			ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
			base = 16;
number:			p = ksprintn(ul, base, &tmp);
			/* Pad on the left (digits come back reversed). */
			if (width && (width -= tmp) > 0)
				while (width--)
					putchar(padc, flags, tp);
			while (ch = *p--)
				putchar(ch, flags, tp);
			break;
		default:
			/* Unknown conversion: echo it back literally. */
			putchar('%', flags, tp);
			if (lflag)
				putchar('l', flags, tp);
			/* FALLTHROUGH */
		case '%':
			putchar(ch, flags, tp);
		}
	}
}

/*
 * Print a character on console or users terminal.  If destination is
 * the console then the last MSGBUFS characters are saved in msgbuf for
 * inspection later.
 */
static void
putchar(c, flags, tp)
	register int c;
	int flags;
	struct tty *tp;
{
	extern int msgbufmapped;
	register struct msgbuf *mbp;

	/* While panicking, bypass any console "window" redirection. */
	if (panicstr)
		constty = NULL;
	if ((flags & TOCONS) && tp == NULL && constty) {
		tp = constty;
		flags |= TOTTY;
	}
	/* If the window tty rejects output, stop redirecting to it. */
	if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 &&
	    (flags & TOCONS) && tp == constty)
		constty = NULL;
	if ((flags & TOLOG) &&
	    c != '\0' && c != '\r' && c != 0177 && msgbufmapped) {
		mbp = msgbufp;
		/* (Re)initialize the message buffer if it looks corrupt. */
		if (mbp->msg_magic != MSG_MAGIC) {
			bzero((caddr_t)mbp, sizeof(*mbp));
			mbp->msg_magic = MSG_MAGIC;
		}
		mbp->msg_bufc[mbp->msg_bufx++] = c;
		if (mbp->msg_bufx < 0 || mbp->msg_bufx >= MSG_BSIZE)
			mbp->msg_bufx = 0;
		/* If the buffer is full, keep the most recent data. */
		if (mbp->msg_bufr == mbp->msg_bufx) {
			if (++mbp->msg_bufr >= MSG_BSIZE)
				mbp->msg_bufr = 0;
		}
	}
	if ((flags & TOCONS) && constty == NULL && c != '\0')
		(*v_putc)(c);
}

/*
 * Scaled down version of sprintf(3).
 */
#ifdef __STDC__
sprintf(char *buf, const char *cfmt, ...)
#else
sprintf(buf, cfmt, va_alist)
	char *buf, *cfmt;
#endif
{
	register const char *fmt = cfmt;
	register char *p, *bp;
	register int ch, base;
	u_long ul;
	int lflag;
	va_list ap;

	/*
	 * NOTE(review): no bounds checking is done on buf; callers must
	 * size the buffer for the worst case.  Returns the length of the
	 * formatted string (excluding the NUL), from the return below.
	 */
	va_start(ap, cfmt);
	for (bp = buf; ; ) {
		while ((ch = *(u_char *)fmt++) != '%')
			if ((*bp++ = ch) == '\0')
				return ((bp - buf) - 1);

		lflag = 0;
reswitch:	switch (ch = *(u_char *)fmt++) {
		case 'l':
			lflag = 1;
			goto reswitch;
		case 'c':
			*bp++ = va_arg(ap, int);
			break;
		case 's':
			p = va_arg(ap, char *);
			while (*bp++ = *p++)
				continue;
			--bp;		/* back up over the copied NUL */
			break;
		case 'd':
			ul = lflag ? va_arg(ap, long) : va_arg(ap, int);
			if ((long)ul < 0) {
				*bp++ = '-';
				ul = -(long)ul;
			}
			base = 10;
			goto number;
			break;
		case 'o':
			ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
			base = 8;
			goto number;
			break;
		case 'u':
			ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
			base = 10;
			goto number;
			break;
		case 'x':
			ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
			base = 16;
			/* ksprintn() returns digits in reverse order. */
number:			for (p = ksprintn(ul, base, NULL); ch = *p--;)
				*bp++ = ch;
			break;
		default:
			*bp++ = '%';
			if (lflag)
				*bp++ = 'l';
			/* FALLTHROUGH */
		case '%':
			*bp++ = ch;
		}
	}
	/* NOTE(review): unreachable -- the only exit is the return above. */
	va_end(ap);
}

/*
 * Put a number (base <= 16) in a buffer in reverse order; return an
 * optional length and a pointer to the NULL terminated (preceded?)
 * buffer.
 */
static char *
ksprintn(ul, base, lenp)
	register u_long ul;
	register int base, *lenp;
{					/* A long in base 8, plus NULL. */
	static char buf[sizeof(long) * NBBY / 3 + 2];
	register char *p;

	/*
	 * buf[0] is left as the NUL terminator (static storage is
	 * zero-initialized); digits are stored from buf[1] upward and
	 * the caller walks the returned pointer back down to it.
	 */
	p = buf;
	do {
		*++p = "0123456789abcdef"[ul % base];
	} while (ul /= base);
	if (lenp)
		*lenp = p - buf;
	return (p);
}
diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c
new file mode 100644
index 000000000000..237553d7c8cb
--- /dev/null
+++ b/sys/kern/subr_prof.c
@@ -0,0 +1,262 @@
/*-
 * Copyright (c) 1982, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * @(#)subr_prof.c	8.4 (Berkeley) 2/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/user.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <machine/cpu.h>

#ifdef GPROF
#include <sys/malloc.h>
#include <sys/gmon.h>

/*
 * Froms is actually a bunch of unsigned shorts indexing tos
 */
struct gmonparam _gmonparam = { GMON_PROF_OFF };

extern char etext[];

/*
 * Set up the kernel profiling buffers (kcount histogram, froms/tos
 * call-graph arrays) covering KERNBASE..etext.  Profiling starts in
 * the GMON_PROF_OFF state; on allocation failure we simply run
 * unprofiled.
 */
void
kmstartup()
{
	char *cp;
	struct gmonparam *p = &_gmonparam;
	/*
	 * Round lowpc and highpc to multiples of the density we're using
	 * so the rest of the scaling (here and in gprof) stays in ints.
	 */
	p->lowpc = ROUNDDOWN(KERNBASE, HISTFRACTION * sizeof(HISTCOUNTER));
	p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
	p->textsize = p->highpc - p->lowpc;
	printf("Profiling kernel, textsize=%d [%x..%x]\n",
	    p->textsize, p->lowpc, p->highpc);
	p->kcountsize = p->textsize / HISTFRACTION;
	p->hashfraction = HASHFRACTION;
	p->fromssize = p->textsize / HASHFRACTION;
	/* Arc table sized as a percentage of text, clamped to sane bounds. */
	p->tolimit = p->textsize * ARCDENSITY / 100;
	if (p->tolimit < MINARCS)
		p->tolimit = MINARCS;
	else if (p->tolimit > MAXARCS)
		p->tolimit = MAXARCS;
	p->tossize = p->tolimit * sizeof(struct tostruct);
	/* One allocation carved into tos, kcount, and froms regions. */
	cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
	    M_GPROF, M_NOWAIT);
	if (cp == 0) {
		printf("No memory for profiling.\n");
		return;
	}
	bzero(cp, p->kcountsize + p->tossize + p->fromssize);
	p->tos = (struct tostruct *)cp;
	cp += p->tossize;
	p->kcount = (u_short *)cp;
	cp += p->kcountsize;
	p->froms = (u_short *)cp;
}

/*
 * Return kernel profiling information.
 */
int
sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
{
	struct gmonparam *gp = &_gmonparam;
	int error;

	/* all sysctl names at this level are terminal */
	if (namelen != 1)
		return (ENOTDIR);		/* overloaded */

	switch (name[0]) {
	case GPROF_STATE:
		/* Writing the state also starts/stops the profile clock. */
		error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state);
		if (error)
			return (error);
		if (gp->state == GMON_PROF_OFF)
			stopprofclock(&proc0);
		else
			startprofclock(&proc0);
		return (0);
	case GPROF_COUNT:
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    gp->kcount, gp->kcountsize));
	case GPROF_FROMS:
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    gp->froms, gp->fromssize));
	case GPROF_TOS:
		return (sysctl_struct(oldp, oldlenp, newp, newlen,
		    gp->tos, gp->tossize));
	case GPROF_GMONPARAM:
		/* Parameters are read-only through sysctl. */
		return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}
#endif /* GPROF */

/*
 * Profiling system call.
 *
 * The scale factor is a fixed point number with 16 bits of fraction, so that
 * 1.0 is represented as 0x10000.  A scale factor of 0 turns off profiling.
 */
/* ARGSUSED */
int
profil(p, uap, retval)
	struct proc *p;
	register struct profil_args /* {
		syscallarg(caddr_t) samples;
		syscallarg(u_int) size;
		syscallarg(u_int) offset;
		syscallarg(u_int) scale;
	} */ *uap;
	register_t *retval;
{
	register struct uprof *upp;
	int s;

	if (SCARG(uap, scale) > (1 << 16))
		return (EINVAL);
	if (SCARG(uap, scale) == 0) {
		stopprofclock(p);
		return (0);
	}
	upp = &p->p_stats->p_prof;

	/* Block profile interrupts while changing state.
 */
	s = splstatclock();
	upp->pr_off = SCARG(uap, offset);
	upp->pr_scale = SCARG(uap, scale);
	upp->pr_base = SCARG(uap, samples);
	upp->pr_size = SCARG(uap, size);
	startprofclock(p);
	splx(s);

	return (0);
}

/*
 * Scale is a fixed-point number with the binary point 16 bits
 * into the value, and is <= 1.0.  pc is at most 32 bits, so the
 * intermediate result is at most 48 bits.
 */
#define	PC_TO_INDEX(pc, prof) \
	((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
	    (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)

/*
 * Collect user-level profiling statistics; called on a profiling tick,
 * when a process is running in user-mode.  This routine may be called
 * from an interrupt context.  We try to update the user profiling buffers
 * cheaply with fuswintr() and suswintr().  If that fails, we revert to
 * an AST that will vector us to trap() with a context in which copyin
 * and copyout will work.  Trap will then call addupc_task().
 *
 * Note that we may (rarely) not get around to the AST soon enough, and
 * lose profile ticks when the next tick overwrites this one, but in this
 * case the system is overloaded and the profile is probably already
 * inaccurate.
 */
void
addupc_intr(p, pc, ticks)
	register struct proc *p;
	register u_long pc;
	u_int ticks;
{
	register struct uprof *prof;
	register caddr_t addr;
	register u_int i;
	register int v;

	if (ticks == 0)
		return;
	prof = &p->p_stats->p_prof;
	if (pc < prof->pr_off ||
	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
		return;			/* out of range; ignore */

	/* Fast path: fetch/store user short from interrupt context. */
	addr = prof->pr_base + i;
	if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
		/* Defer to addupc_task() via an AST. */
		prof->pr_addr = pc;
		prof->pr_ticks = ticks;
		need_proftick(p);
	}
}

/*
 * Much like before, but we can afford to take faults here.  If the
 * update fails, we simply turn off profiling.
 */
void
addupc_task(p, pc, ticks)
	register struct proc *p;
	register u_long pc;
	u_int ticks;
{
	register struct uprof *prof;
	register caddr_t addr;
	register u_int i;
	u_short v;

	/* Testing P_PROFIL may be unnecessary, but is certainly safe. */
	if ((p->p_flag & P_PROFIL) == 0 || ticks == 0)
		return;

	prof = &p->p_stats->p_prof;
	if (pc < prof->pr_off ||
	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
		return;

	/* Read-modify-write the user's counter; faults are tolerated. */
	addr = prof->pr_base + i;
	if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) {
		v += ticks;
		if (copyout((caddr_t)&v, addr, sizeof(v)) == 0)
			return;
	}
	/* Copy failed: the sample buffer is bad; stop profiling. */
	stopprofclock(p);
}
diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c
new file mode 100644
index 000000000000..45b2d64619f7
--- /dev/null
+++ b/sys/kern/subr_xxx.c
@@ -0,0 +1,117 @@
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)subr_xxx.c	8.3 (Berkeley) 3/29/95
 */

/*
 * Miscellaneous trivial functions, including many
 * that are often inline-expanded or done in assembler.
 * These are stock entries for device switch tables and
 * similar function-pointer slots: each simply returns a
 * fixed errno value (or success).
 */
#include <sys/param.h>
#include <sys/systm.h>

#include <machine/cpu.h>

/*
 * Unsupported device function (e.g. writing to read-only device).
 */
int
enodev()
{

	return (ENODEV);
}

/*
 * Unconfigured device function; driver not configured.
 */
int
enxio()
{

	return (ENXIO);
}

/*
 * Unsupported ioctl function.
 */
int
enoioctl()
{

	return (ENOTTY);
}

/*
 * Unsupported system function.
 * This is used for an otherwise-reasonable operation
 * that is not supported by the current system binary.
 */
int
enosys()
{

	return (ENOSYS);
}

/*
 * Return error for operation not supported
 * on a specific object or file type.
 */
int
eopnotsupp()
{

	return (EOPNOTSUPP);
}

/*
 * Return error for an inval operation
 * on a specific object or file type.
 */
int
einval()
{

	return (EINVAL);
}

/*
 * Generic null operation, always returns success.
 */
int
nullop()
{

	return (0);
}
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
new file mode 100644
index 000000000000..08385b3276e0
--- /dev/null
+++ b/sys/kern/sys_generic.c
@@ -0,0 +1,690 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <sys/mount.h>
#include <sys/syscallargs.h>

/*
 * Read system call.
 */
/* ARGSUSED */
int
read(p, uap, retval)
	struct proc *p;
	register struct read_args /* {
		syscallarg(int) fd;
		syscallarg(char *) buf;
		syscallarg(u_int) nbyte;
	} */ *uap;
	register_t *retval;
{
	register struct file *fp;
	register struct filedesc *fdp = p->p_fd;
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	/* Validate the descriptor and require read permission. */
	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
	    (fp->f_flag & FREAD) == 0)
		return (EBADF);
	aiov.iov_base = (caddr_t)SCARG(uap, buf);
	aiov.iov_len = SCARG(uap, nbyte);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = SCARG(uap, nbyte);
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = SCARG(uap, nbyte);
	/* A partial transfer before interruption counts as success. */
	if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, &ktriov,
		    cnt, error);
#endif
	*retval = cnt;
	return (error);
}

/*
 * Scatter read system call.
 */
int
readv(p, uap, retval)
	struct proc *p;
	register struct readv_args /* {
		syscallarg(int) fd;
		syscallarg(struct iovec *) iovp;
		syscallarg(u_int) iovcnt;
	} */ *uap;
	register_t *retval;
{
	register struct file *fp;
	register struct filedesc *fdp = p->p_fd;
	struct uio auio;
	register struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
	    (fp->f_flag & FREAD) == 0)
		return (EBADF);
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
		if (SCARG(uap, iovcnt) > UIO_MAXIOV)
			return (EINVAL);
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else {
		/* Small vectors use the on-stack array; nothing to free. */
		iov = aiov;
		needfree = NULL;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = SCARG(uap, iovcnt);
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	if (error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < SCARG(uap, iovcnt); i++) {
		/* Guard against total-length overflow while summing. */
		if (auio.uio_resid + iov->iov_len < auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, ktriov,
			    cnt, error);
		FREE(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
done:
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}

/*
 * Write system call
 */
int
write(p, uap, retval)
	struct proc *p;
	register struct write_args /* {
		syscallarg(int) fd;
		syscallarg(char *) buf;
		syscallarg(u_int) nbyte;
	} */ *uap;
	register_t *retval;
{
	register struct file *fp;
	register struct filedesc *fdp = p->p_fd;
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
#endif

	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
	    (fp->f_flag & FWRITE) == 0)
		return (EBADF);
	aiov.iov_base = (caddr_t)SCARG(uap, buf);
	aiov.iov_len = SCARG(uap, nbyte);
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = SCARG(uap, nbyte);
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = SCARG(uap, nbyte);
	if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Writes to broken pipes also deliver SIGPIPE. */
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
		    &ktriov, cnt, error);
#endif
	*retval = cnt;
	return (error);
}

/*
 * Gather write system call
 */
int
writev(p, uap, retval)
	struct proc *p;
	register struct writev_args /* {
		syscallarg(int) fd;
		syscallarg(struct iovec *) iovp;
		syscallarg(u_int) iovcnt;
	} */ *uap;
	register_t *retval;
{
	register struct file *fp;
	register struct filedesc *fdp = p->p_fd;
	struct uio auio;
	register struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
	    (fp->f_flag & FWRITE) == 0)
		return (EBADF);
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
	if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
		if (SCARG(uap, iovcnt) > UIO_MAXIOV)
			return (EINVAL);
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else {
		iov = aiov;
		needfree = NULL;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = SCARG(uap, iovcnt);
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	if (error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen))
		goto done;
	auio.uio_resid = 0;
	for (i = 0; i < SCARG(uap, iovcnt); i++) {
		/* Guard against total-length overflow while summing. */
		if (auio.uio_resid + iov->iov_len < auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
			    ktriov, cnt, error);
		FREE(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
done:
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}

/*
 * Ioctl system call
 */
/* ARGSUSED */
int
ioctl(p, uap, retval)
	struct proc *p;
	register struct ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(caddr_t) data;
	} */ *uap;
	register_t *retval;
{
	register struct file *fp;
	register struct filedesc *fdp;
	register u_long com;
	register int error;
	register u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	char stkbuf[STK_PARAMS];

	fdp = p->p_fd;
	if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	/* Close-on-exec flags live in the descriptor table; no driver call. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	memp = NULL;
	/* Large argument buffers are heap-allocated; small ones on-stack. */
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				return (error);
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Track FNONBLOCK in f_flag and forward to the driver. */
		if (tmp = *(int *)data)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if (tmp = *(int *)data)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			((struct socket *)fp->f_data)->so_pgid = tmp;
			error = 0;
			break;
		}
		/* Positive arg is a pid; map it to its process group id. */
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	return (error);
}

/* selwait: sleep channel for select(); nselcoll: collision generation. */
int	selwait, nselcoll;

/*
 * Select system call.
+ */ +int +select(p, uap, retval) + register struct proc *p; + register struct select_args /* { + syscallarg(u_int) nd; + syscallarg(fd_set *) in; + syscallarg(fd_set *) ou; + syscallarg(fd_set *) ex; + syscallarg(struct timeval *) tv; + } */ *uap; + register_t *retval; +{ + fd_set ibits[3], obits[3]; + struct timeval atv; + int s, ncoll, error, timo = 0; + u_int ni; + + bzero((caddr_t)ibits, sizeof(ibits)); + bzero((caddr_t)obits, sizeof(obits)); + if (SCARG(uap, nd) > FD_SETSIZE) + return (EINVAL); + if (SCARG(uap, nd) > p->p_fd->fd_nfiles) { + /* forgiving; slightly wrong */ + SCARG(uap, nd) = p->p_fd->fd_nfiles; + } + ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask); + +#define getbits(name, x) \ + if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \ + (caddr_t)&ibits[x], ni))) \ + goto done; + getbits(in, 0); + getbits(ou, 1); + getbits(ex, 2); +#undef getbits + + if (SCARG(uap, tv)) { + error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv, + sizeof (atv)); + if (error) + goto done; + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + s = splclock(); + timevaladd(&atv, (struct timeval *)&time); + splx(s); + } +retry: + ncoll = nselcoll; + p->p_flag |= P_SELECT; + error = selscan(p, ibits, obits, SCARG(uap, nd), retval); + if (error || *retval) + goto done; + s = splhigh(); + if (SCARG(uap, tv)) { + if (timercmp(&time, &atv, >=)) { + splx(s); + goto done; + } + /* + * If poll wait was tiny, this could be zero; we will + * have to round it up to avoid sleeping forever. If + * we retry below, the timercmp above will get us out. + * Note that if wait was 0, the timercmp will prevent + * us from getting here the first time. 
+ */ + timo = hzto(&atv); + if (timo == 0) + timo = 1; + } + if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { + splx(s); + goto retry; + } + p->p_flag &= ~P_SELECT; + error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo); + splx(s); + if (error == 0) + goto retry; +done: + p->p_flag &= ~P_SELECT; + /* select is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; +#define putbits(name, x) \ + if (SCARG(uap, name) && (error2 = copyout((caddr_t)&obits[x], \ + (caddr_t)SCARG(uap, name), ni))) \ + error = error2; + if (error == 0) { + int error2; + + putbits(in, 0); + putbits(ou, 1); + putbits(ex, 2); +#undef putbits + } + return (error); +} + +int +selscan(p, ibits, obits, nfd, retval) + struct proc *p; + fd_set *ibits, *obits; + int nfd; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + register int msk, i, j, fd; + register fd_mask bits; + struct file *fp; + int n = 0; + static int flag[3] = { FREAD, FWRITE, 0 }; + + for (msk = 0; msk < 3; msk++) { + for (i = 0; i < nfd; i += NFDBITS) { + bits = ibits[msk].fds_bits[i/NFDBITS]; + while ((j = ffs(bits)) && (fd = i + --j) < nfd) { + bits &= ~(1 << j); + fp = fdp->fd_ofiles[fd]; + if (fp == NULL) + return (EBADF); + if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) { + FD_SET(fd, &obits[msk]); + n++; + } + } + } + } + *retval = n; + return (0); +} + +/*ARGSUSED*/ +int +seltrue(dev, flag, p) + dev_t dev; + int flag; + struct proc *p; +{ + + return (1); +} + +/* + * Record a select request. + */ +void +selrecord(selector, sip) + struct proc *selector; + struct selinfo *sip; +{ + struct proc *p; + pid_t mypid; + + mypid = selector->p_pid; + if (sip->si_pid == mypid) + return; + if (sip->si_pid && (p = pfind(sip->si_pid)) && + p->p_wchan == (caddr_t)&selwait) + sip->si_flags |= SI_COLL; + else + sip->si_pid = mypid; +} + +/* + * Do a wakeup when a selectable event occurs. 
+ */ +void +selwakeup(sip) + register struct selinfo *sip; +{ + register struct proc *p; + int s; + + if (sip->si_pid == 0) + return; + if (sip->si_flags & SI_COLL) { + nselcoll++; + sip->si_flags &= ~SI_COLL; + wakeup((caddr_t)&selwait); + } + p = pfind(sip->si_pid); + sip->si_pid = 0; + if (p != NULL) { + s = splhigh(); + if (p->p_wchan == (caddr_t)&selwait) { + if (p->p_stat == SSLEEP) + setrunnable(p); + else + unsleep(p); + } else if (p->p_flag & P_SELECT) + p->p_flag &= ~P_SELECT; + splx(s); + } +} diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c new file mode 100644 index 000000000000..abc2dc75ec8f --- /dev/null +++ b/sys/kern/sys_socket.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_socket.c 8.3 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/ioctl.h> +#include <sys/stat.h> + +#include <net/if.h> +#include <net/route.h> + +struct fileops socketops = + { soo_read, soo_write, soo_ioctl, soo_select, soo_close }; + +/* ARGSUSED */ +int +soo_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + return (soreceive((struct socket *)fp->f_data, (struct mbuf **)0, + uio, (struct mbuf **)0, (struct mbuf **)0, (int *)0)); +} + +/* ARGSUSED */ +int +soo_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + return (sosend((struct socket *)fp->f_data, (struct mbuf *)0, + uio, (struct mbuf *)0, (struct mbuf *)0, 0)); +} + +int +soo_ioctl(fp, cmd, data, p) + struct file *fp; + u_long cmd; + register caddr_t data; + struct proc *p; +{ + register struct socket *so = (struct socket *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + if (*(int *)data) + so->so_state |= SS_NBIO; + else + so->so_state &= ~SS_NBIO; + return (0); + + case FIOASYNC: + if (*(int *)data) { + so->so_state |= SS_ASYNC; + so->so_rcv.sb_flags |= SB_ASYNC; + so->so_snd.sb_flags |= SB_ASYNC; + } else { + so->so_state &= ~SS_ASYNC; + so->so_rcv.sb_flags &= ~SB_ASYNC; + so->so_snd.sb_flags &= ~SB_ASYNC; 
+ } + return (0); + + case FIONREAD: + *(int *)data = so->so_rcv.sb_cc; + return (0); + + case SIOCSPGRP: + so->so_pgid = *(int *)data; + return (0); + + case SIOCGPGRP: + *(int *)data = so->so_pgid; + return (0); + + case SIOCATMARK: + *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + return (0); + } + /* + * Interface/routing/protocol specific ioctls: + * interface and routing ioctls should have a + * different entry since a socket's unnecessary + */ + if (IOCGROUP(cmd) == 'i') + return (ifioctl(so, cmd, data, p)); + if (IOCGROUP(cmd) == 'r') + return (rtioctl(cmd, data, p)); + return ((*so->so_proto->pr_usrreq)(so, PRU_CONTROL, + (struct mbuf *)cmd, (struct mbuf *)data, (struct mbuf *)0)); +} + +int +soo_select(fp, which, p) + struct file *fp; + int which; + struct proc *p; +{ + register struct socket *so = (struct socket *)fp->f_data; + register int s = splnet(); + + switch (which) { + + case FREAD: + if (soreadable(so)) { + splx(s); + return (1); + } + selrecord(p, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + break; + + case FWRITE: + if (sowriteable(so)) { + splx(s); + return (1); + } + selrecord(p, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_SEL; + break; + + case 0: + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { + splx(s); + return (1); + } + selrecord(p, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + break; + } + splx(s); + return (0); +} + +int +soo_stat(so, ub) + register struct socket *so; + register struct stat *ub; +{ + + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFSOCK; + return ((*so->so_proto->pr_usrreq)(so, PRU_SENSE, + (struct mbuf *)ub, (struct mbuf *)0, + (struct mbuf *)0)); +} + +/* ARGSUSED */ +int +soo_close(fp, p) + struct file *fp; + struct proc *p; +{ + int error = 0; + + if (fp->f_data) + error = soclose((struct socket *)fp->f_data); + fp->f_data = 0; + return (error); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c new file mode 100644 index 000000000000..91cbdc937f8d --- /dev/null 
+++ b/sys/kern/syscalls.c @@ -0,0 +1,279 @@ +/* + * System call names. + * + * DO NOT EDIT-- this file is automatically generated. + * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95 + */ + +char *syscallnames[] = { + "syscall", /* 0 = syscall */ + "exit", /* 1 = exit */ + "fork", /* 2 = fork */ + "read", /* 3 = read */ + "write", /* 4 = write */ + "open", /* 5 = open */ + "close", /* 6 = close */ + "wait4", /* 7 = wait4 */ + "compat_43_creat", /* 8 = compat_43 creat */ + "link", /* 9 = link */ + "unlink", /* 10 = unlink */ + "#11 (obsolete execv)", /* 11 = obsolete execv */ + "chdir", /* 12 = chdir */ + "fchdir", /* 13 = fchdir */ + "mknod", /* 14 = mknod */ + "chmod", /* 15 = chmod */ + "chown", /* 16 = chown */ + "break", /* 17 = break */ + "getfsstat", /* 18 = getfsstat */ + "compat_43_lseek", /* 19 = compat_43 lseek */ + "getpid", /* 20 = getpid */ + "mount", /* 21 = mount */ + "unmount", /* 22 = unmount */ + "setuid", /* 23 = setuid */ + "getuid", /* 24 = getuid */ + "geteuid", /* 25 = geteuid */ + "ptrace", /* 26 = ptrace */ + "recvmsg", /* 27 = recvmsg */ + "sendmsg", /* 28 = sendmsg */ + "recvfrom", /* 29 = recvfrom */ + "accept", /* 30 = accept */ + "getpeername", /* 31 = getpeername */ + "getsockname", /* 32 = getsockname */ + "access", /* 33 = access */ + "chflags", /* 34 = chflags */ + "fchflags", /* 35 = fchflags */ + "sync", /* 36 = sync */ + "kill", /* 37 = kill */ + "compat_43_stat", /* 38 = compat_43 stat */ + "getppid", /* 39 = getppid */ + "compat_43_lstat", /* 40 = compat_43 lstat */ + "dup", /* 41 = dup */ + "pipe", /* 42 = pipe */ + "getegid", /* 43 = getegid */ + "profil", /* 44 = profil */ +#ifdef KTRACE + "ktrace", /* 45 = ktrace */ +#else + "#45 (unimplemented ktrace)", /* 45 = unimplemented ktrace */ +#endif + "sigaction", /* 46 = sigaction */ + "getgid", /* 47 = getgid */ + "sigprocmask", /* 48 = sigprocmask */ + "getlogin", /* 49 = getlogin */ + "setlogin", /* 50 = setlogin */ + "acct", /* 51 = acct */ + "sigpending", /* 52 = 
sigpending */ + "sigaltstack", /* 53 = sigaltstack */ + "ioctl", /* 54 = ioctl */ + "reboot", /* 55 = reboot */ + "revoke", /* 56 = revoke */ + "symlink", /* 57 = symlink */ + "readlink", /* 58 = readlink */ + "execve", /* 59 = execve */ + "umask", /* 60 = umask */ + "chroot", /* 61 = chroot */ + "compat_43_fstat", /* 62 = compat_43 fstat */ + "compat_43_getkerninfo", /* 63 = compat_43 getkerninfo */ + "compat_43_getpagesize", /* 64 = compat_43 getpagesize */ + "msync", /* 65 = msync */ + "vfork", /* 66 = vfork */ + "#67 (obsolete vread)", /* 67 = obsolete vread */ + "#68 (obsolete vwrite)", /* 68 = obsolete vwrite */ + "sbrk", /* 69 = sbrk */ + "sstk", /* 70 = sstk */ + "compat_43_mmap", /* 71 = compat_43 mmap */ + "vadvise", /* 72 = vadvise */ + "munmap", /* 73 = munmap */ + "mprotect", /* 74 = mprotect */ + "madvise", /* 75 = madvise */ + "#76 (obsolete vhangup)", /* 76 = obsolete vhangup */ + "#77 (obsolete vlimit)", /* 77 = obsolete vlimit */ + "mincore", /* 78 = mincore */ + "getgroups", /* 79 = getgroups */ + "setgroups", /* 80 = setgroups */ + "getpgrp", /* 81 = getpgrp */ + "setpgid", /* 82 = setpgid */ + "setitimer", /* 83 = setitimer */ + "compat_43_wait", /* 84 = compat_43 wait */ + "swapon", /* 85 = swapon */ + "getitimer", /* 86 = getitimer */ + "compat_43_gethostname", /* 87 = compat_43 gethostname */ + "compat_43_sethostname", /* 88 = compat_43 sethostname */ + "getdtablesize", /* 89 = getdtablesize */ + "dup2", /* 90 = dup2 */ + "#91 (unimplemented getdopt)", /* 91 = unimplemented getdopt */ + "fcntl", /* 92 = fcntl */ + "select", /* 93 = select */ + "#94 (unimplemented setdopt)", /* 94 = unimplemented setdopt */ + "fsync", /* 95 = fsync */ + "setpriority", /* 96 = setpriority */ + "socket", /* 97 = socket */ + "connect", /* 98 = connect */ + "compat_43_accept", /* 99 = compat_43 accept */ + "getpriority", /* 100 = getpriority */ + "compat_43_send", /* 101 = compat_43 send */ + "compat_43_recv", /* 102 = compat_43 recv */ + "sigreturn", /* 103 = 
sigreturn */ + "bind", /* 104 = bind */ + "setsockopt", /* 105 = setsockopt */ + "listen", /* 106 = listen */ + "#107 (obsolete vtimes)", /* 107 = obsolete vtimes */ + "compat_43_sigvec", /* 108 = compat_43 sigvec */ + "compat_43_sigblock", /* 109 = compat_43 sigblock */ + "compat_43_sigsetmask", /* 110 = compat_43 sigsetmask */ + "sigsuspend", /* 111 = sigsuspend */ + "compat_43_sigstack", /* 112 = compat_43 sigstack */ + "compat_43_recvmsg", /* 113 = compat_43 recvmsg */ + "compat_43_sendmsg", /* 114 = compat_43 sendmsg */ +#ifdef TRACE + "vtrace", /* 115 = vtrace */ +#else + "#115 (obsolete vtrace)", /* 115 = obsolete vtrace */ +#endif + "gettimeofday", /* 116 = gettimeofday */ + "getrusage", /* 117 = getrusage */ + "getsockopt", /* 118 = getsockopt */ +#ifdef vax + "resuba", /* 119 = resuba */ +#else + "#119 (unimplemented resuba)", /* 119 = unimplemented resuba */ +#endif + "readv", /* 120 = readv */ + "writev", /* 121 = writev */ + "settimeofday", /* 122 = settimeofday */ + "fchown", /* 123 = fchown */ + "fchmod", /* 124 = fchmod */ + "compat_43_recvfrom", /* 125 = compat_43 recvfrom */ + "compat_43_setreuid", /* 126 = compat_43 setreuid */ + "compat_43_setregid", /* 127 = compat_43 setregid */ + "rename", /* 128 = rename */ + "compat_43_truncate", /* 129 = compat_43 truncate */ + "compat_43_ftruncate", /* 130 = compat_43 ftruncate */ + "flock", /* 131 = flock */ + "mkfifo", /* 132 = mkfifo */ + "sendto", /* 133 = sendto */ + "shutdown", /* 134 = shutdown */ + "socketpair", /* 135 = socketpair */ + "mkdir", /* 136 = mkdir */ + "rmdir", /* 137 = rmdir */ + "utimes", /* 138 = utimes */ + "#139 (obsolete 4.2 sigreturn)", /* 139 = obsolete 4.2 sigreturn */ + "adjtime", /* 140 = adjtime */ + "compat_43_getpeername", /* 141 = compat_43 getpeername */ + "compat_43_gethostid", /* 142 = compat_43 gethostid */ + "compat_43_sethostid", /* 143 = compat_43 sethostid */ + "compat_43_getrlimit", /* 144 = compat_43 getrlimit */ + "compat_43_setrlimit", /* 145 = compat_43 
setrlimit */ + "compat_43_killpg", /* 146 = compat_43 killpg */ + "setsid", /* 147 = setsid */ + "quotactl", /* 148 = quotactl */ + "compat_43_quota", /* 149 = compat_43 quota */ + "compat_43_getsockname", /* 150 = compat_43 getsockname */ + "#151 (unimplemented)", /* 151 = unimplemented */ + "#152 (unimplemented)", /* 152 = unimplemented */ + "#153 (unimplemented)", /* 153 = unimplemented */ + "#154 (unimplemented)", /* 154 = unimplemented */ +#ifdef NFS + "nfssvc", /* 155 = nfssvc */ +#else + "#155 (unimplemented nfssvc)", /* 155 = unimplemented nfssvc */ +#endif + "compat_43_getdirentries", /* 156 = compat_43 getdirentries */ + "statfs", /* 157 = statfs */ + "fstatfs", /* 158 = fstatfs */ + "#159 (unimplemented)", /* 159 = unimplemented */ + "#160 (unimplemented)", /* 160 = unimplemented */ +#ifdef NFS + "getfh", /* 161 = getfh */ +#else + "#161 (unimplemented getfh)", /* 161 = unimplemented getfh */ +#endif + "#162 (unimplemented getdomainname)", /* 162 = unimplemented getdomainname */ + "#163 (unimplemented setdomainname)", /* 163 = unimplemented setdomainname */ + "#164 (unimplemented)", /* 164 = unimplemented */ + "#165 (unimplemented)", /* 165 = unimplemented */ + "#166 (unimplemented)", /* 166 = unimplemented */ + "#167 (unimplemented)", /* 167 = unimplemented */ + "#168 (unimplemented)", /* 168 = unimplemented */ + "#169 (unimplemented semsys)", /* 169 = unimplemented semsys */ + "#170 (unimplemented msgsys)", /* 170 = unimplemented msgsys */ +#if defined(SYSVSHM) && !defined(alpha) + "compat_43_shmsys", /* 171 = compat_43 shmsys */ +#else + "#171 (unimplemented shmsys)", /* 171 = unimplemented shmsys */ +#endif + "#172 (unimplemented)", /* 172 = unimplemented */ + "#173 (unimplemented)", /* 173 = unimplemented */ + "#174 (unimplemented)", /* 174 = unimplemented */ + "#175 (unimplemented)", /* 175 = unimplemented */ + "#176 (unimplemented)", /* 176 = unimplemented */ + "#177 (unimplemented)", /* 177 = unimplemented */ + "#178 (unimplemented)", /* 178 = 
unimplemented */ + "#179 (unimplemented)", /* 179 = unimplemented */ + "#180 (unimplemented)", /* 180 = unimplemented */ + "setgid", /* 181 = setgid */ + "setegid", /* 182 = setegid */ + "seteuid", /* 183 = seteuid */ +#ifdef LFS + "lfs_bmapv", /* 184 = lfs_bmapv */ + "lfs_markv", /* 185 = lfs_markv */ + "lfs_segclean", /* 186 = lfs_segclean */ + "lfs_segwait", /* 187 = lfs_segwait */ +#else + "#184 (unimplemented lfs_bmapv)", /* 184 = unimplemented lfs_bmapv */ + "#185 (unimplemented lfs_markv)", /* 185 = unimplemented lfs_markv */ + "#186 (unimplemented lfs_segclean)", /* 186 = unimplemented lfs_segclean */ + "#187 (unimplemented lfs_segwait)", /* 187 = unimplemented lfs_segwait */ +#endif + "stat", /* 188 = stat */ + "fstat", /* 189 = fstat */ + "lstat", /* 190 = lstat */ + "pathconf", /* 191 = pathconf */ + "fpathconf", /* 192 = fpathconf */ + "#193 (unimplemented)", /* 193 = unimplemented */ + "getrlimit", /* 194 = getrlimit */ + "setrlimit", /* 195 = setrlimit */ + "getdirentries", /* 196 = getdirentries */ + "mmap", /* 197 = mmap */ + "__syscall", /* 198 = __syscall */ + "lseek", /* 199 = lseek */ + "truncate", /* 200 = truncate */ + "ftruncate", /* 201 = ftruncate */ + "__sysctl", /* 202 = __sysctl */ + "mlock", /* 203 = mlock */ + "munlock", /* 204 = munlock */ + "undelete", /* 205 = undelete */ + "#206 (unimplemented)", /* 206 = unimplemented */ + "#207 (unimplemented)", /* 207 = unimplemented */ + "#208 (unimplemented)", /* 208 = unimplemented */ + "#209 (unimplemented)", /* 209 = unimplemented */ + "#210 (unimplemented)", /* 210 = unimplemented */ + "#211 (unimplemented)", /* 211 = unimplemented */ + "#212 (unimplemented)", /* 212 = unimplemented */ + "#213 (unimplemented)", /* 213 = unimplemented */ + "#214 (unimplemented)", /* 214 = unimplemented */ + "#215 (unimplemented)", /* 215 = unimplemented */ + "#216 (unimplemented)", /* 216 = unimplemented */ + "#217 (unimplemented)", /* 217 = unimplemented */ + "#218 (unimplemented)", /* 218 = unimplemented 
*/ + "#219 (unimplemented)", /* 219 = unimplemented */ + "#220 (unimplemented semctl)", /* 220 = unimplemented semctl */ + "#221 (unimplemented semget)", /* 221 = unimplemented semget */ + "#222 (unimplemented semop)", /* 222 = unimplemented semop */ + "#223 (unimplemented semconfig)", /* 223 = unimplemented semconfig */ + "#224 (unimplemented msgctl)", /* 224 = unimplemented msgctl */ + "#225 (unimplemented msgget)", /* 225 = unimplemented msgget */ + "#226 (unimplemented msgsnd)", /* 226 = unimplemented msgsnd */ + "#227 (unimplemented msgrcv)", /* 227 = unimplemented msgrcv */ +#if defined(SYSVSHM) && 0 + "shmat", /* 228 = shmat */ + "shmctl", /* 229 = shmctl */ + "shmdt", /* 230 = shmdt */ + "shmget", /* 231 = shmget */ +#else + "#228 (unimplemented shmat)", /* 228 = unimplemented shmat */ + "#229 (unimplemented shmctl)", /* 229 = unimplemented shmctl */ + "#230 (unimplemented shmdt)", /* 230 = unimplemented shmdt */ + "#231 (unimplemented shmget)", /* 231 = unimplemented shmget */ +#endif +}; diff --git a/sys/kern/syscalls.conf b/sys/kern/syscalls.conf new file mode 100644 index 000000000000..71b82ceff152 --- /dev/null +++ b/sys/kern/syscalls.conf @@ -0,0 +1,12 @@ +# @(#)syscalls.conf 8.1 (Berkeley) 2/14/95 + +sysnames="syscalls.c" +sysnumhdr="../sys/syscall.h" +syssw="init_sysent.c" +sysarghdr="../sys/syscallargs.h" +compatopts="compat_43" +libcompatopts="" + +switchname="sysent" +namesname="syscallnames" +constprefix="SYS_" diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master new file mode 100644 index 000000000000..b57cd73bde30 --- /dev/null +++ b/sys/kern/syscalls.master @@ -0,0 +1,355 @@ + @(#)syscalls.master 8.6 (Berkeley) 3/30/95 +; System call name/number "master" file. +; (See syscalls.conf to see what it is processed into.) +; +; Fields: number type [type-dependent ...] +; number system call number, must be in order +; type one of STD, OBSOL, UNIMPL, NODEF, NOARGS, or one of +; the compatibility options defined in syscalls.conf. 
+; +; types: +; STD always included +; OBSOL obsolete, not included in system +; UNIMPL unimplemented, not included in system +; NODEF included, but don't define the syscall number +; NOARGS included, but don't define the syscall args structure +; +; The compat options are defined in the syscalls.conf file, and the +; compat option name is prefixed to the syscall name. Other than +; that, they're like NODEF (for 'compat' options), or STD (for +; 'libcompat' options). +; +; The type-dependent arguments are as follows: +; For STD, NODEF, NOARGS, and compat syscalls: +; { pseudo-proto } [alias] +; For other syscalls: +; [comment] +; +; #ifdef's, etc. may be included, and are copied to the output files. +; #include's are copied to the syscall switch definition file only. + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/signal.h> +#include <sys/mount.h> +#include <sys/syscallargs.h> + +; Reserved/unimplemented system calls in the range 0-150 inclusive +; are reserved for use in future Berkeley releases. +; Additional system calls implemented in vendor and other +; redistributions should be placed in the reserved range at the end +; of the current calls. 
+ +0 STD { int nosys(void); } syscall +1 STD { int exit(int rval); } +2 STD { int fork(void); } +3 STD { int read(int fd, char *buf, u_int nbyte); } +4 STD { int write(int fd, char *buf, u_int nbyte); } +5 STD { int open(char *path, int flags, int mode); } +6 STD { int close(int fd); } +7 STD { int wait4(int pid, int *status, int options, \ + struct rusage *rusage); } +8 COMPAT_43 { int creat(char *path, int mode); } +9 STD { int link(char *path, char *link); } +10 STD { int unlink(char *path); } +11 OBSOL execv +12 STD { int chdir(char *path); } +13 STD { int fchdir(int fd); } +14 STD { int mknod(char *path, int mode, int dev); } +15 STD { int chmod(char *path, int mode); } +16 STD { int chown(char *path, int uid, int gid); } +17 STD { int obreak(char *nsize); } break +18 STD { int getfsstat(struct statfs *buf, long bufsize, \ + int flags); } +19 COMPAT_43 { long lseek(int fd, long offset, int whence); } +20 STD { pid_t getpid(void); } +21 STD { int mount(char *type, char *path, int flags, \ + caddr_t data); } +22 STD { int unmount(char *path, int flags); } +23 STD { int setuid(uid_t uid); } +24 STD { uid_t getuid(void); } +25 STD { uid_t geteuid(void); } +26 STD { int ptrace(int req, pid_t pid, caddr_t addr, \ + int data); } +27 STD { int recvmsg(int s, struct msghdr *msg, int flags); } +28 STD { int sendmsg(int s, caddr_t msg, int flags); } +29 STD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } +30 STD { int accept(int s, caddr_t name, int *anamelen); } +31 STD { int getpeername(int fdes, caddr_t asa, int *alen); } +32 STD { int getsockname(int fdes, caddr_t asa, int *alen); } +33 STD { int access(char *path, int flags); } +34 STD { int chflags(char *path, int flags); } +35 STD { int fchflags(int fd, int flags); } +36 STD { int sync(void); } +37 STD { int kill(int pid, int signum); } +38 COMPAT_43 { int stat(char *path, struct ostat *ub); } +39 STD { pid_t getppid(void); } +40 COMPAT_43 { int lstat(char *path, 
struct ostat *ub); } +41 STD { int dup(u_int fd); } +42 STD { int pipe(void); } +43 STD { gid_t getegid(void); } +44 STD { int profil(caddr_t samples, u_int size, \ + u_int offset, u_int scale); } +#ifdef KTRACE +45 STD { int ktrace(char *fname, int ops, int facs, \ + int pid); } +#else +45 UNIMPL ktrace +#endif +46 STD { int sigaction(int signum, struct sigaction *nsa, \ + struct sigaction *osa); } +47 STD { gid_t getgid(void); } +48 STD { int sigprocmask(int how, sigset_t mask); } +49 STD { int getlogin(char *namebuf, u_int namelen); } +50 STD { int setlogin(char *namebuf); } +51 STD { int acct(char *path); } +52 STD { int sigpending(void); } +53 STD { int sigaltstack(struct sigaltstack *nss, \ + struct sigaltstack *oss); } +54 STD { int ioctl(int fd, u_long com, caddr_t data); } +55 STD { int reboot(int opt); } +56 STD { int revoke(char *path); } +57 STD { int symlink(char *path, char *link); } +58 STD { int readlink(char *path, char *buf, int count); } +59 STD { int execve(char *path, char **argp, char **envp); } +60 STD { int umask(int newmask); } +61 STD { int chroot(char *path); } +62 COMPAT_43 { int fstat(int fd, struct ostat *sb); } +63 COMPAT_43 { int getkerninfo(int op, char *where, int *size, \ + int arg); } +64 COMPAT_43 { int getpagesize(void); } +65 STD { int msync(caddr_t addr, int len); } +66 STD { int vfork(void); } +67 OBSOL vread +68 OBSOL vwrite +69 STD { int sbrk(int incr); } +70 STD { int sstk(int incr); } +71 COMPAT_43 { int mmap(caddr_t addr, int len, int prot, \ + int flags, int fd, long pos); } +72 STD { int ovadvise(int anom); } vadvise +73 STD { int munmap(caddr_t addr, int len); } +74 STD { int mprotect(caddr_t addr, int len, int prot); } +75 STD { int madvise(caddr_t addr, int len, int behav); } +76 OBSOL vhangup +77 OBSOL vlimit +78 STD { int mincore(caddr_t addr, int len, char *vec); } +79 STD { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 STD { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 STD { int getpgrp(void); 
} +82 STD { int setpgid(int pid, int pgid); } +83 STD { int setitimer(u_int which, struct itimerval *itv, \ + struct itimerval *oitv); } +84 COMPAT_43 { int wait(void); } +85 STD { int swapon(char *name); } +86 STD { int getitimer(u_int which, struct itimerval *itv); } +87 COMPAT_43 { int gethostname(char *hostname, u_int len); } +88 COMPAT_43 { int sethostname(char *hostname, u_int len); } +89 STD { int getdtablesize(void); } +90 STD { int dup2(u_int from, u_int to); } +91 UNIMPL getdopt +92 STD { int fcntl(int fd, int cmd, void *arg); } +93 STD { int select(u_int nd, fd_set *in, fd_set *ou, \ + fd_set *ex, struct timeval *tv); } +94 UNIMPL setdopt +95 STD { int fsync(int fd); } +96 STD { int setpriority(int which, int who, int prio); } +97 STD { int socket(int domain, int type, int protocol); } +98 STD { int connect(int s, caddr_t name, int namelen); } +99 COMPAT_43 { int accept(int s, caddr_t name, int *anamelen); } +100 STD { int getpriority(int which, int who); } +101 COMPAT_43 { int send(int s, caddr_t buf, int len, int flags); } +102 COMPAT_43 { int recv(int s, caddr_t buf, int len, int flags); } +103 STD { int sigreturn(struct sigcontext *sigcntxp); } +104 STD { int bind(int s, caddr_t name, int namelen); } +105 STD { int setsockopt(int s, int level, int name, \ + caddr_t val, int valsize); } +106 STD { int listen(int s, int backlog); } +107 OBSOL vtimes +108 COMPAT_43 { int sigvec(int signum, struct sigvec *nsv, \ + struct sigvec *osv); } +109 COMPAT_43 { int sigblock(int mask); } +110 COMPAT_43 { int sigsetmask(int mask); } +111 STD { int sigsuspend(int mask); } +112 COMPAT_43 { int sigstack(struct sigstack *nss, \ + struct sigstack *oss); } +113 COMPAT_43 { int recvmsg(int s, struct omsghdr *msg, int flags); } +114 COMPAT_43 { int sendmsg(int s, caddr_t msg, int flags); } +#ifdef TRACE +115 STD { int vtrace(int request, int value); } +#else +115 OBSOL vtrace +#endif +116 STD { int gettimeofday(struct timeval *tp, \ + struct timezone *tzp); } +117 STD { 
int getrusage(int who, struct rusage *rusage); } +118 STD { int getsockopt(int s, int level, int name, \ + caddr_t val, int *avalsize); } +#ifdef vax +119 STD { int resuba(int value); } +#else +119 UNIMPL resuba +#endif +120 STD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 STD { int writev(int fd, struct iovec *iovp, \ + u_int iovcnt); } +122 STD { int settimeofday(struct timeval *tv, \ + struct timezone *tzp); } +123 STD { int fchown(int fd, int uid, int gid); } +124 STD { int fchmod(int fd, int mode); } +125 COMPAT_43 { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } +126 COMPAT_43 { int setreuid(int ruid, int euid); } +127 COMPAT_43 { int setregid(int rgid, int egid); } +128 STD { int rename(char *from, char *to); } +129 COMPAT_43 { int truncate(char *path, long length); } +130 COMPAT_43 { int ftruncate(int fd, long length); } +131 STD { int flock(int fd, int how); } +132 STD { int mkfifo(char *path, int mode); } +133 STD { int sendto(int s, caddr_t buf, size_t len, \ + int flags, caddr_t to, int tolen); } +134 STD { int shutdown(int s, int how); } +135 STD { int socketpair(int domain, int type, int protocol, \ + int *rsv); } +136 STD { int mkdir(char *path, int mode); } +137 STD { int rmdir(char *path); } +138 STD { int utimes(char *path, struct timeval *tptr); } +139 OBSOL 4.2 sigreturn +140 STD { int adjtime(struct timeval *delta, \ + struct timeval *olddelta); } +141 COMPAT_43 { int getpeername(int fdes, caddr_t asa, int *alen); } +142 COMPAT_43 { int32_t gethostid(void); } +143 COMPAT_43 { int sethostid(int32_t hostid); } +144 COMPAT_43 { int getrlimit(u_int which, struct ogetrlimit *rlp); } +145 COMPAT_43 { int setrlimit(u_int which, struct ogetrlimit *rlp); } +146 COMPAT_43 { int killpg(int pgid, int signum); } +147 STD { int setsid(void); } +148 STD { int quotactl(char *path, int cmd, int uid, \ + caddr_t arg); } +149 COMPAT_43 { int quota(void); } +150 COMPAT_43 { int getsockname(int fdec, 
caddr_t asa, int *alen); } + +; Syscalls 151-180 inclusive are reserved for vendor-specific +; system calls. (This includes various calls added for compatibity +; with other Unix variants.) +; Some of these calls are now supported by BSD... +151 UNIMPL +152 UNIMPL +153 UNIMPL +154 UNIMPL +#ifdef NFS +155 STD { int nfssvc(int flag, caddr_t argp); } +#else +155 UNIMPL nfssvc +#endif +156 COMPAT_43 { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +157 STD { int statfs(char *path, struct statfs *buf); } +158 STD { int fstatfs(int fd, struct statfs *buf); } +159 UNIMPL +160 UNIMPL +#ifdef NFS +161 STD { int getfh(char *fname, fhandle_t *fhp); } +#else +161 UNIMPL getfh +#endif +162 UNIMPL getdomainname +163 UNIMPL setdomainname +164 UNIMPL +165 UNIMPL +166 UNIMPL +167 UNIMPL +168 UNIMPL +169 UNIMPL semsys +170 UNIMPL msgsys +; XXX more generally, never on machines where sizeof(void *) != sizeof(int) +#if defined(SYSVSHM) && !defined(alpha) +171 COMPAT_43 { int shmsys(int which, int a2, int a3, int a4); } +#else +171 UNIMPL shmsys +#endif +172 UNIMPL +173 UNIMPL +174 UNIMPL +175 UNIMPL +176 UNIMPL +177 UNIMPL +178 UNIMPL +179 UNIMPL +180 UNIMPL + +; Syscalls 180-209 are used by/reserved for BSD +181 STD { int setgid(gid_t gid); } +182 STD { int setegid(gid_t egid); } +183 STD { int seteuid(uid_t euid); } +#ifdef LFS +184 STD { int lfs_bmapv(fsid_t *fsidp, \ + struct block_info *blkiov, int blkcnt); } +185 STD { int lfs_markv(fsid_t *fsidp, \ + struct block_info *blkiov, int blkcnt); } +186 STD { int lfs_segclean(fsid_t *fsidp, u_long segment); } +187 STD { int lfs_segwait(fsid_t *fsidp, struct timeval *tv); } +#else +184 UNIMPL lfs_bmapv +185 UNIMPL lfs_markv +186 UNIMPL lfs_segclean +187 UNIMPL lfs_segwait +#endif +188 STD { int stat(char *path, struct stat *ub); } +189 STD { int fstat(int fd, struct stat *sb); } +190 STD { int lstat(char *path, struct stat *ub); } +191 STD { int pathconf(char *path, int name); } +192 STD { int fpathconf(int fd, 
int name); } +193 UNIMPL +194 STD { int getrlimit(u_int which, struct rlimit *rlp); } +195 STD { int setrlimit(u_int which, struct rlimit *rlp); } +196 STD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +197 STD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ + int flags, int fd, long pad, off_t pos); } +198 STD { int nosys(void); } __syscall +199 STD { off_t lseek(int fd, int pad, off_t offset, \ + int whence); } +200 STD { int truncate(char *path, int pad, off_t length); } +201 STD { int ftruncate(int fd, int pad, off_t length); } +202 STD { int __sysctl(int *name, u_int namelen, void *old, \ + size_t *oldlenp, void *new, size_t newlen); } +203 STD { int mlock(caddr_t addr, size_t len); } +204 STD { int munlock(caddr_t addr, size_t len); } +205 STD { int undelete(char *path); } +206 UNIMPL +207 UNIMPL +208 UNIMPL +209 UNIMPL +; Syscalls 210-219 are used by/reserved for vendor-specific system calls +210 UNIMPL +211 UNIMPL +212 UNIMPL +213 UNIMPL +214 UNIMPL +215 UNIMPL +216 UNIMPL +217 UNIMPL +218 UNIMPL +219 UNIMPL +; System calls 220-240 are reserved for use by BSD +220 UNIMPL semctl +221 UNIMPL semget +222 UNIMPL semop +223 UNIMPL semconfig +224 UNIMPL msgctl +225 UNIMPL msgget +226 UNIMPL msgsnd +227 UNIMPL msgrcv +#if defined(SYSVSHM) && 0 +228 STD { int shmat(int shmid, void *shmaddr, int shmflg); } +229 STD { int shmctl(int shmid, int cmd, \ + struct shmid_ds *buf); } +230 STD { int shmdt(void *shmaddr); } +231 STD { int shmget(key_t key, int size, int shmflg); } +#else +228 UNIMPL shmat +229 UNIMPL shmctl +230 UNIMPL shmdt +231 UNIMPL shmget +#endif diff --git a/sys/kern/tty.c b/sys/kern/tty.c new file mode 100644 index 000000000000..5d698b111d6a --- /dev/null +++ b/sys/kern/tty.c @@ -0,0 +1,1927 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty.c 8.13 (Berkeley) 1/9/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl.h> +#include <sys/proc.h> +#define TTYDEFCHARS +#include <sys/tty.h> +#undef TTYDEFCHARS +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/dkstat.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/syslog.h> + +#include <vm/vm.h> + +static int proc_compare __P((struct proc *p1, struct proc *p2)); +static int ttnread __P((struct tty *)); +static void ttyblock __P((struct tty *tp)); +static void ttyecho __P((int, struct tty *tp)); +static void ttyrubo __P((struct tty *, int)); + +/* Symbolic sleep message strings. */ +char ttclos[] = "ttycls"; +char ttopen[] = "ttyopn"; +char ttybg[] = "ttybg"; +char ttybuf[] = "ttybuf"; +char ttyin[] = "ttyin"; +char ttyout[] = "ttyout"; + +/* + * Table with character classes and parity. The 8th bit indicates parity, + * the 7th bit indicates the character is an alphameric or underscore (for + * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits + * are 0 then the character needs no special processing on output; classes + * other than 0 might be translated or (not currently) require delays. + */ +#define E 0x00 /* Even parity. */ +#define O 0x80 /* Odd parity. */ +#define PARITY(c) (char_type[c] & O) + +#define ALPHA 0x40 /* Alpha or underscore. 
*/ +#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) + +#define CCLASSMASK 0x3f +#define CCLASS(c) (char_type[c] & CCLASSMASK) + +#define BS BACKSPACE +#define CC CONTROL +#define CR RETURN +#define NA ORDINARY | ALPHA +#define NL NEWLINE +#define NO ORDINARY +#define TB TAB +#define VT VTAB + +char const char_type[] = { + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ + O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ + O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ + O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ + E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ + O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */ + O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ + O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ + E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ + E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ + /* + * Meta chars; should be settable per character set; + * for now, treat them all as normal characters. 
+ */ + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, +}; +#undef BS +#undef CC +#undef CR +#undef NA +#undef NL +#undef NO +#undef TB +#undef VT + +/* Macros to clear/set/test flags. */ +#define SET(t, f) (t) |= (f) +#define CLR(t, f) (t) &= ~(f) +#define ISSET(t, f) ((t) & (f)) + +/* + * Initial open of tty, or (re)entry to standard tty line discipline. + */ +int +ttyopen(device, tp) + dev_t device; + register struct tty *tp; +{ + int s; + + s = spltty(); + tp->t_dev = device; + if (!ISSET(tp->t_state, TS_ISOPEN)) { + SET(tp->t_state, TS_ISOPEN); + bzero(&tp->t_winsize, sizeof(tp->t_winsize)); + } + CLR(tp->t_state, TS_WOPEN); + splx(s); + return (0); +} + +/* + * Handle close() on a tty line: flush and set to initial state, + * bumping generation number so that pending read/write calls + * can detect recycling of the tty. + */ +int +ttyclose(tp) + register struct tty *tp; +{ + extern struct tty *constty; /* Temporary virtual console. */ + + if (constty == tp) + constty = NULL; + + ttyflush(tp, FREAD | FWRITE); + + tp->t_gen++; + tp->t_pgrp = NULL; + tp->t_session = NULL; + tp->t_state = 0; + return (0); +} + +#define FLUSHQ(q) { \ + if ((q)->c_cc) \ + ndflush(q, (q)->c_cc); \ +} + +/* Is 'c' a line delimiter ("break" character)? */ +#define TTBREAKC(c) \ + ((c) == '\n' || ((c) == cc[VEOF] || \ + (c) == cc[VEOL] || (c) == cc[VEOL2]) && (c) != _POSIX_VDISABLE) + + +/* + * Process input of a single character received on a tty. 
+ */ +int +ttyinput(c, tp) + register int c; + register struct tty *tp; +{ + register int iflag, lflag; + register u_char *cc; + int i, err; + + /* + * If input is pending take it first. + */ + lflag = tp->t_lflag; + if (ISSET(lflag, PENDIN)) + ttypend(tp); + /* + * Gather stats. + */ + if (ISSET(lflag, ICANON)) { + ++tk_cancc; + ++tp->t_cancc; + } else { + ++tk_rawcc; + ++tp->t_rawcc; + } + ++tk_nin; + + /* Handle exceptional conditions (break, parity, framing). */ + cc = tp->t_cc; + iflag = tp->t_iflag; + if (err = (ISSET(c, TTY_ERRORMASK))) { + CLR(c, TTY_ERRORMASK); + if (ISSET(err, TTY_FE) && !c) { /* Break. */ + if (ISSET(iflag, IGNBRK)) + goto endcase; + else if (ISSET(iflag, BRKINT) && + ISSET(lflag, ISIG) && + (cc[VINTR] != _POSIX_VDISABLE)) + c = cc[VINTR]; + else if (ISSET(iflag, PARMRK)) + goto parmrk; + } else if (ISSET(err, TTY_PE) && + ISSET(iflag, INPCK) || ISSET(err, TTY_FE)) { + if (ISSET(iflag, IGNPAR)) + goto endcase; + else if (ISSET(iflag, PARMRK)) { +parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + (void)putc(0 | TTY_QUOTE, &tp->t_rawq); + (void)putc(c | TTY_QUOTE, &tp->t_rawq); + goto endcase; + } else + c = 0; + } + } + /* + * In tandem mode, check high water mark. + */ + if (ISSET(iflag, IXOFF)) + ttyblock(tp); + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) + CLR(c, 0x80); + if (!ISSET(lflag, EXTPROC)) { + /* + * Check for literal nexting very first + */ + if (ISSET(tp->t_state, TS_LNCH)) { + SET(c, TTY_QUOTE); + CLR(tp->t_state, TS_LNCH); + } + /* + * Scan for special characters. This code + * is really just a big case statement with + * non-constant cases. The bottom of the + * case statement is labeled ``endcase'', so goto + * it after a case match, or similar. + */ + + /* + * Control chars which aren't controlled + * by ICANON, ISIG, or IXON. 
+ */ + if (ISSET(lflag, IEXTEN)) { + if (CCEQ(cc[VLNEXT], c)) { + if (ISSET(lflag, ECHO)) { + if (ISSET(lflag, ECHOE)) { + (void)ttyoutput('^', tp); + (void)ttyoutput('\b', tp); + } else + ttyecho(c, tp); + } + SET(tp->t_state, TS_LNCH); + goto endcase; + } + if (CCEQ(cc[VDISCARD], c)) { + if (ISSET(lflag, FLUSHO)) + CLR(tp->t_lflag, FLUSHO); + else { + ttyflush(tp, FWRITE); + ttyecho(c, tp); + if (tp->t_rawq.c_cc + tp->t_canq.c_cc) + ttyretype(tp); + SET(tp->t_lflag, FLUSHO); + } + goto startoutput; + } + } + /* + * Signals. + */ + if (ISSET(lflag, ISIG)) { + if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD | FWRITE); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, + CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); + goto endcase; + } + if (CCEQ(cc[VSUSP], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + goto endcase; + } + } + /* + * Handle start/stop characters. + */ + if (ISSET(iflag, IXON)) { + if (CCEQ(cc[VSTOP], c)) { + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)].d_stop)(tp, + 0); +#endif + return (0); + } + if (!CCEQ(cc[VSTART], c)) + return (0); + /* + * if VSTART == VSTOP then toggle + */ + goto endcase; + } + if (CCEQ(cc[VSTART], c)) + goto restartoutput; + } + /* + * IGNCR, ICRNL, & INLCR + */ + if (c == '\r') { + if (ISSET(iflag, IGNCR)) + goto endcase; + else if (ISSET(iflag, ICRNL)) + c = '\n'; + } else if (c == '\n' && ISSET(iflag, INLCR)) + c = '\r'; + } + if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { + /* + * From here on down canonical mode character + * processing takes place. + */ + /* + * erase (^H / ^?) 
+ */ + if (CCEQ(cc[VERASE], c)) { + if (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + goto endcase; + } + /* + * kill (^U) + */ + if (CCEQ(cc[VKILL], c)) { + if (ISSET(lflag, ECHOKE) && + tp->t_rawq.c_cc == tp->t_rocount && + !ISSET(lflag, ECHOPRT)) + while (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + else { + ttyecho(c, tp); + if (ISSET(lflag, ECHOK) || + ISSET(lflag, ECHOKE)) + ttyecho('\n', tp); + FLUSHQ(&tp->t_rawq); + tp->t_rocount = 0; + } + CLR(tp->t_state, TS_LOCAL); + goto endcase; + } + /* + * word erase (^W) + */ + if (CCEQ(cc[VWERASE], c)) { + int alt = ISSET(lflag, ALTWERASE); + int ctype; + + /* + * erase whitespace + */ + while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') + ttyrub(c, tp); + if (c == -1) + goto endcase; + /* + * erase last char of word and remember the + * next chars type (for ALTWERASE) + */ + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + if (c == ' ' || c == '\t') { + (void)putc(c, &tp->t_rawq); + goto endcase; + } + ctype = ISALPHA(c); + /* + * erase rest of word + */ + do { + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + } while (c != ' ' && c != '\t' && + (alt == 0 || ISALPHA(c) == ctype)); + (void)putc(c, &tp->t_rawq); + goto endcase; + } + /* + * reprint line (^R) + */ + if (CCEQ(cc[VREPRINT], c)) { + ttyretype(tp); + goto endcase; + } + /* + * ^T - kernel info and generate SIGINFO + */ + if (CCEQ(cc[VSTATUS], c)) { + if (ISSET(lflag, ISIG)) + pgsignal(tp->t_pgrp, SIGINFO, 1); + if (!ISSET(lflag, NOKERNINFO)) + ttyinfo(tp); + goto endcase; + } + } + /* + * Check for input buffer overflow + */ + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) { + if (ISSET(iflag, IMAXBEL)) { + if (tp->t_outq.c_cc < tp->t_hiwat) + (void)ttyoutput(CTRL('g'), tp); + } else + ttyflush(tp, FREAD | FWRITE); + goto endcase; + } + /* + * Put data char in q for user and + * wakeup on seeing a line delimiter. 
+ */ + if (putc(c, &tp->t_rawq) >= 0) { + if (!ISSET(lflag, ICANON)) { + ttwakeup(tp); + ttyecho(c, tp); + goto endcase; + } + if (TTBREAKC(c)) { + tp->t_rocount = 0; + catq(&tp->t_rawq, &tp->t_canq); + ttwakeup(tp); + } else if (tp->t_rocount++ == 0) + tp->t_rocol = tp->t_column; + if (ISSET(tp->t_state, TS_ERASE)) { + /* + * end of prterase \.../ + */ + CLR(tp->t_state, TS_ERASE); + (void)ttyoutput('/', tp); + } + i = tp->t_column; + ttyecho(c, tp); + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { + /* + * Place the cursor over the '^' of the ^D. + */ + i = min(2, tp->t_column - i); + while (i > 0) { + (void)ttyoutput('\b', tp); + i--; + } + } + } +endcase: + /* + * IXANY means allow any character to restart output. + */ + if (ISSET(tp->t_state, TS_TTSTOP) && + !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) + return (0); +restartoutput: + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); +startoutput: + return (ttstart(tp)); +} + +/* + * Output a single character on a tty, doing output processing + * as needed (expanding tabs, newline processing, etc.). + * Returns < 0 if succeeds, otherwise returns char to resend. + * Must be recursive. + */ +int +ttyoutput(c, tp) + register int c; + register struct tty *tp; +{ + register long oflag; + register int notout, col, s; + + oflag = tp->t_oflag; + if (!ISSET(oflag, OPOST)) { + if (ISSET(tp->t_lflag, FLUSHO)) + return (-1); + if (putc(c, &tp->t_outq)) + return (c); + tk_nout++; + tp->t_outcc++; + return (-1); + } + /* + * Do tab expansion if OXTABS is set. Special case if we external + * processing, we don't do the tab expansion because we'll probably + * get it wrong. If tab expansion needs to be done, let it happen + * externally. + */ + CLR(c, ~TTY_CHARMASK); + if (c == '\t' && + ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { + c = 8 - (tp->t_column & 7); + if (ISSET(tp->t_lflag, FLUSHO)) { + notout = 0; + } else { + s = spltty(); /* Don't interrupt tabs. 
*/ + notout = b_to_q(" ", c, &tp->t_outq); + c -= notout; + tk_nout += c; + tp->t_outcc += c; + splx(s); + } + tp->t_column += c; + return (notout ? '\t' : -1); + } + if (c == CEOT && ISSET(oflag, ONOEOT)) + return (-1); + + /* + * Newline translation: if ONLCR is set, + * translate newline into "\r\n". + */ + if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { + tk_nout++; + tp->t_outcc++; + if (putc('\r', &tp->t_outq)) + return (c); + } + tk_nout++; + tp->t_outcc++; + if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) + return (c); + + col = tp->t_column; + switch (CCLASS(c)) { + case BACKSPACE: + if (col > 0) + --col; + break; + case CONTROL: + break; + case NEWLINE: + case RETURN: + col = 0; + break; + case ORDINARY: + ++col; + break; + case TAB: + col = (col + 8) & ~7; + break; + } + tp->t_column = col; + return (-1); +} + +/* + * Ioctls for all tty devices. Called after line-discipline specific ioctl + * has been called to do discipline-specific functions and/or reject any + * of these ioctl commands. + */ +/* ARGSUSED */ +int +ttioctl(tp, cmd, data, flag) + register struct tty *tp; + u_long cmd; + void *data; + int flag; +{ + extern struct tty *constty; /* Temporary virtual console. */ + extern int nlinesw; + register struct proc *p; + int s, error; + + p = curproc; /* XXX */ + + /* If the ioctl involves modification, hang if in the background. 
*/ + switch (cmd) { + case TIOCFLUSH: + case TIOCSETA: + case TIOCSETD: + case TIOCSETAF: + case TIOCSETAW: +#ifdef notdef + case TIOCSPGRP: +#endif + case TIOCSTI: + case TIOCSWINSZ: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCLBIC: + case TIOCLBIS: + case TIOCLSET: + case TIOCSETC: + case OTIOCSETD: + case TIOCSETN: + case TIOCSETP: + case TIOCSLTC: +#endif + while (isbackground(curproc, tp) && + p->p_pgrp->pg_jobc && (p->p_flag & P_PPWAIT) == 0 && + (p->p_sigignore & sigmask(SIGTTOU)) == 0 && + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + pgsignal(p->p_pgrp, SIGTTOU, 1); + if (error = ttysleep(tp, + &lbolt, TTOPRI | PCATCH, ttybg, 0)) + return (error); + } + break; + } + + switch (cmd) { /* Process the ioctl. */ + case FIOASYNC: /* set/clear async i/o */ + s = spltty(); + if (*(int *)data) + SET(tp->t_state, TS_ASYNC); + else + CLR(tp->t_state, TS_ASYNC); + splx(s); + break; + case FIONBIO: /* set/clear non-blocking i/o */ + break; /* XXX: delete. */ + case FIONREAD: /* get # bytes to read */ + *(int *)data = ttnread(tp); + break; + case TIOCEXCL: /* set exclusive use of tty */ + s = spltty(); + SET(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCFLUSH: { /* flush buffers */ + register int flags = *(int *)data; + + if (flags == 0) + flags = FREAD | FWRITE; + else + flags &= FREAD | FWRITE; + ttyflush(tp, flags); + break; + } + case TIOCCONS: /* become virtual console */ + if (*(int *)data) { + if (constty && constty != tp && + ISSET(constty->t_state, TS_CARR_ON | TS_ISOPEN) == + (TS_CARR_ON | TS_ISOPEN)) + return (EBUSY); +#ifndef UCONSOLE + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + constty = tp; + } else if (tp == constty) + constty = NULL; + break; + case TIOCDRAIN: /* wait till output drained */ + if (error = ttywait(tp)) + return (error); + break; + case TIOCGETA: { /* get termios struct */ + struct termios *t = (struct termios *)data; + + bcopy(&tp->t_termios, t, sizeof(struct termios)); + break; + } + 
case TIOCGETD: /* get line discipline */ + *(int *)data = tp->t_line; + break; + case TIOCGWINSZ: /* get window size */ + *(struct winsize *)data = tp->t_winsize; + break; + case TIOCGPGRP: /* get pgrp of tty */ + if (!isctty(p, tp)) + return (ENOTTY); + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + break; +#ifdef TIOCHPCL + case TIOCHPCL: /* hang up on last close */ + s = spltty(); + SET(tp->t_cflag, HUPCL); + splx(s); + break; +#endif + case TIOCNXCL: /* reset exclusive use of tty */ + s = spltty(); + CLR(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCOUTQ: /* output queue size */ + *(int *)data = tp->t_outq.c_cc; + break; + case TIOCSETA: /* set termios struct */ + case TIOCSETAW: /* drain output, set */ + case TIOCSETAF: { /* drn out, fls in, set */ + register struct termios *t = (struct termios *)data; + + s = spltty(); + if (cmd == TIOCSETAW || cmd == TIOCSETAF) { + if (error = ttywait(tp)) { + splx(s); + return (error); + } + if (cmd == TIOCSETAF) + ttyflush(tp, FREAD); + } + if (!ISSET(t->c_cflag, CIGNORE)) { + /* + * Set device hardware. + */ + if (tp->t_param && (error = (*tp->t_param)(tp, t))) { + splx(s); + return (error); + } else { + if (!ISSET(tp->t_state, TS_CARR_ON) && + ISSET(tp->t_cflag, CLOCAL) && + !ISSET(t->c_cflag, CLOCAL)) { + CLR(tp->t_state, TS_ISOPEN); + SET(tp->t_state, TS_WOPEN); + ttwakeup(tp); + } + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + tp->t_ospeed = t->c_ospeed; + } + ttsetwater(tp); + } + if (cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON) != + ISSET(tp->t_lflag, ICANON)) + if (ISSET(t->c_lflag, ICANON)) { + SET(tp->t_lflag, PENDIN); + ttwakeup(tp); + } else { + struct clist tq; + + catq(&tp->t_rawq, &tp->t_canq); + tq = tp->t_rawq; + tp->t_rawq = tp->t_canq; + tp->t_canq = tq; + CLR(tp->t_lflag, PENDIN); + } + } + tp->t_iflag = t->c_iflag; + tp->t_oflag = t->c_oflag; + /* + * Make the EXTPROC bit read only. 
+ */ + if (ISSET(tp->t_lflag, EXTPROC)) + SET(t->c_lflag, EXTPROC); + else + CLR(t->c_lflag, EXTPROC); + tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); + splx(s); + break; + } + case TIOCSETD: { /* set line discipline */ + register int t = *(int *)data; + dev_t device = tp->t_dev; + + if ((u_int)t >= nlinesw) + return (ENXIO); + if (t != tp->t_line) { + s = spltty(); + (*linesw[tp->t_line].l_close)(tp, flag); + error = (*linesw[t].l_open)(device, tp); + if (error) { + (void)(*linesw[tp->t_line].l_open)(device, tp); + splx(s); + return (error); + } + tp->t_line = t; + splx(s); + } + break; + } + case TIOCSTART: /* start output, like ^Q */ + s = spltty(); + if (ISSET(tp->t_state, TS_TTSTOP) || + ISSET(tp->t_lflag, FLUSHO)) { + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } + splx(s); + break; + case TIOCSTI: /* simulate terminal input */ + if (p->p_ucred->cr_uid && (flag & FREAD) == 0) + return (EPERM); + if (p->p_ucred->cr_uid && !isctty(p, tp)) + return (EACCES); + (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + break; + case TIOCSTOP: /* stop output, like ^S */ + s = spltty(); + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0); +#endif + } + splx(s); + break; + case TIOCSCTTY: /* become controlling tty */ + /* Session ctty vnode pointer set in vnode layer. 
*/ + if (!SESS_LEADER(p) || + (p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session)) + return (EPERM); + tp->t_session = p->p_session; + tp->t_pgrp = p->p_pgrp; + p->p_session->s_ttyp = tp; + p->p_flag |= P_CONTROLT; + break; + case TIOCSPGRP: { /* set pgrp of tty */ + register struct pgrp *pgrp = pgfind(*(int *)data); + + if (!isctty(p, tp)) + return (ENOTTY); + else if (pgrp == NULL || pgrp->pg_session != p->p_session) + return (EPERM); + tp->t_pgrp = pgrp; + break; + } + case TIOCSWINSZ: /* set window size */ + if (bcmp((caddr_t)&tp->t_winsize, data, + sizeof (struct winsize))) { + tp->t_winsize = *(struct winsize *)data; + pgsignal(tp->t_pgrp, SIGWINCH, 1); + } + break; + default: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + return (ttcompat(tp, cmd, data, flag)); +#else + return (-1); +#endif + } + return (0); +} + +int +ttselect(device, rw, p) + dev_t device; + int rw; + struct proc *p; +{ + register struct tty *tp; + int nread, s; + + tp = &cdevsw[major(device)].d_ttys[minor(device)]; + + s = spltty(); + switch (rw) { + case FREAD: + nread = ttnread(tp); + if (nread > 0 || !ISSET(tp->t_cflag, CLOCAL) && + !ISSET(tp->t_state, TS_CARR_ON)) + goto win; + selrecord(p, &tp->t_rsel); + break; + case FWRITE: + if (tp->t_outq.c_cc <= tp->t_lowat) { +win: splx(s); + return (1); + } + selrecord(p, &tp->t_wsel); + break; + } + splx(s); + return (0); +} + +static int +ttnread(tp) + struct tty *tp; +{ + int nread; + + if (ISSET(tp->t_lflag, PENDIN)) + ttypend(tp); + nread = tp->t_canq.c_cc; + if (!ISSET(tp->t_lflag, ICANON)) + nread += tp->t_rawq.c_cc; + return (nread); +} + +/* + * Wait for output to drain. 
+ */ +int +ttywait(tp) + register struct tty *tp; +{ + int error, s; + + error = 0; + s = spltty(); + while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + (ISSET(tp->t_state, TS_CARR_ON) || ISSET(tp->t_cflag, CLOCAL)) + && tp->t_oproc) { + (*tp->t_oproc)(tp); + SET(tp->t_state, TS_ASLEEP); + if (error = ttysleep(tp, + &tp->t_outq, TTOPRI | PCATCH, ttyout, 0)) + break; + } + splx(s); + return (error); +} + +/* + * Flush if successfully wait. + */ +int +ttywflush(tp) + struct tty *tp; +{ + int error; + + if ((error = ttywait(tp)) == 0) + ttyflush(tp, FREAD); + return (error); +} + +/* + * Flush tty read and/or write queues, notifying anyone waiting. + */ +void +ttyflush(tp, rw) + register struct tty *tp; + int rw; +{ + register int s; + + s = spltty(); + if (rw & FREAD) { + FLUSHQ(&tp->t_canq); + FLUSHQ(&tp->t_rawq); + tp->t_rocount = 0; + tp->t_rocol = 0; + CLR(tp->t_state, TS_LOCAL); + ttwakeup(tp); + } + if (rw & FWRITE) { + CLR(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, rw); +#else + (*cdevsw[major(tp->t_dev)].d_stop)(tp, rw); +#endif + FLUSHQ(&tp->t_outq); + wakeup((caddr_t)&tp->t_outq); + selwakeup(&tp->t_wsel); + } + splx(s); +} + +/* + * Copy in the default termios characters. + */ +void +ttychars(tp) + struct tty *tp; +{ + + bcopy(ttydefchars, tp->t_cc, sizeof(ttydefchars)); +} + +/* + * Send stop character on input overflow. + */ +static void +ttyblock(tp) + register struct tty *tp; +{ + register int total; + + total = tp->t_rawq.c_cc + tp->t_canq.c_cc; + if (tp->t_rawq.c_cc > TTYHOG) { + ttyflush(tp, FREAD | FWRITE); + CLR(tp->t_state, TS_TBLOCK); + } + /* + * Block further input iff: current input > threshold + * AND input is available to user program. 
+ */ + if (total >= TTYHOG / 2 && + !ISSET(tp->t_state, TS_TBLOCK) && + !ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0 && + tp->t_cc[VSTOP] != _POSIX_VDISABLE) { + if (putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) { + SET(tp->t_state, TS_TBLOCK); + ttstart(tp); + } + } +} + +void +ttrstrt(tp_arg) + void *tp_arg; +{ + struct tty *tp; + int s; + +#ifdef DIAGNOSTIC + if (tp_arg == NULL) + panic("ttrstrt"); +#endif + tp = tp_arg; + s = spltty(); + + CLR(tp->t_state, TS_TIMEOUT); + ttstart(tp); + + splx(s); +} + +int +ttstart(tp) + struct tty *tp; +{ + + if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */ + (*tp->t_oproc)(tp); + return (0); +} + +/* + * "close" a line discipline + */ +int +ttylclose(tp, flag) + struct tty *tp; + int flag; +{ + + if (flag & IO_NDELAY) + ttyflush(tp, FREAD | FWRITE); + else + ttywflush(tp); + return (0); +} + +/* + * Handle modem control transition on a tty. + * Flag indicates new state of carrier. + * Returns 0 if the line should be turned off, otherwise 1. + */ +int +ttymodem(tp, flag) + register struct tty *tp; + int flag; +{ + + if (!ISSET(tp->t_state, TS_WOPEN) && ISSET(tp->t_cflag, MDMBUF)) { + /* + * MDMBUF: do flow control according to carrier flag + */ + if (flag) { + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } else if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0); +#endif + } + } else if (flag == 0) { + /* + * Lost carrier. + */ + CLR(tp->t_state, TS_CARR_ON); + if (ISSET(tp->t_state, TS_ISOPEN) && + !ISSET(tp->t_cflag, CLOCAL)) { + if (tp->t_session && tp->t_session->s_leader) + psignal(tp->t_session->s_leader, SIGHUP); + ttyflush(tp, FREAD | FWRITE); + return (0); + } + } else { + /* + * Carrier now on. + */ + SET(tp->t_state, TS_CARR_ON); + ttwakeup(tp); + } + return (1); +} + +/* + * Default modem control routine (for other line disciplines). + * Return argument flag, to turn off device on carrier drop. 
+ */ +int +nullmodem(tp, flag) + register struct tty *tp; + int flag; +{ + + if (flag) + SET(tp->t_state, TS_CARR_ON); + else { + CLR(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_cflag, CLOCAL)) { + if (tp->t_session && tp->t_session->s_leader) + psignal(tp->t_session->s_leader, SIGHUP); + return (0); + } + } + return (1); +} + +/* + * Reinput pending characters after state switch + * call at spltty(). + */ +void +ttypend(tp) + register struct tty *tp; +{ + struct clist tq; + register c; + + CLR(tp->t_lflag, PENDIN); + SET(tp->t_state, TS_TYPEN); + tq = tp->t_rawq; + tp->t_rawq.c_cc = 0; + tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0; + while ((c = getc(&tq)) >= 0) + ttyinput(c, tp); + CLR(tp->t_state, TS_TYPEN); +} + +/* + * Process a read call on a tty device. + */ +int +ttread(tp, uio, flag) + register struct tty *tp; + struct uio *uio; + int flag; +{ + register struct clist *qp; + register int c; + register long lflag; + register u_char *cc = tp->t_cc; + register struct proc *p = curproc; + int s, first, error = 0; + +loop: lflag = tp->t_lflag; + s = spltty(); + /* + * take pending input first + */ + if (ISSET(lflag, PENDIN)) + ttypend(tp); + splx(s); + + /* + * Hang process if it's in the background. + */ + if (isbackground(p, tp)) { + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0) + return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0)) + return (error); + goto loop; + } + + /* + * If canonical, use the canonical queue, + * else use the raw queue. + * + * (should get rid of clists...) + */ + qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; + + /* + * If there is no input, sleep on rawq + * awaiting hardware receipt and notification. + * If we have data, we don't need to check for carrier. 
+ */ + s = spltty(); + if (qp->c_cc <= 0) { + int carrier; + + carrier = ISSET(tp->t_state, TS_CARR_ON) || + ISSET(tp->t_cflag, CLOCAL); + if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) { + splx(s); + return (0); /* EOF */ + } + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + error = ttysleep(tp, &tp->t_rawq, TTIPRI | PCATCH, + carrier ? ttyin : ttopen, 0); + splx(s); + if (error) + return (error); + goto loop; + } + splx(s); + + /* + * Input present, check for input mapping and processing. + */ + first = 1; + while ((c = getc(qp)) >= 0) { + /* + * delayed suspend (^Y) + */ + if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, ISIG)) { + pgsignal(tp->t_pgrp, SIGTSTP, 1); + if (first) { + if (error = ttysleep(tp, + &lbolt, TTIPRI | PCATCH, ttybg, 0)) + break; + goto loop; + } + break; + } + /* + * Interpret EOF only in canonical mode. + */ + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) + break; + /* + * Give user character. + */ + error = ureadc(c, uio); + if (error) + break; + if (uio->uio_resid == 0) + break; + /* + * In canonical mode check for a "break character" + * marking the end of a "line of input". + */ + if (ISSET(lflag, ICANON) && TTBREAKC(c)) + break; + first = 0; + } + /* + * Look to unblock output now that (presumably) + * the input queue has gone down. + */ + s = spltty(); + if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG/5) { + if (cc[VSTART] != _POSIX_VDISABLE && + putc(cc[VSTART], &tp->t_outq) == 0) { + CLR(tp->t_state, TS_TBLOCK); + ttstart(tp); + } + } + splx(s); + return (error); +} + +/* + * Check the output queue on tp for space for a kernel message (from uprintf + * or tprintf). Allow some space over the normal hiwater mark so we don't + * lose messages due to normal flow control, but don't let the tty run amok. + * Sleeps here are not interruptible, but we return prematurely if new signals + * arrive. 
 */
/*
 * ttycheckoutq --
 *	Returns 1 when there is room on the output queue for a kernel
 *	message, 0 when the caller should give up.  When `wait' is set,
 *	drains over-full output (sleeping uninterruptibly at PZERO - 1,
 *	woken by a one-tick timeout) but bails out early if a new signal
 *	arrives while waiting.
 */
int
ttycheckoutq(tp, wait)
	register struct tty *tp;
	int wait;
{
	int hiwat, s, oldsig;

	hiwat = tp->t_hiwat;
	s = spltty();
	/* Snapshot pending signals so a newly arriving one aborts the wait. */
	oldsig = wait ? curproc->p_siglist : 0;
	if (tp->t_outq.c_cc > hiwat + 200)
		while (tp->t_outq.c_cc > hiwat) {
			ttstart(tp);
			if (wait == 0 || curproc->p_siglist != oldsig) {
				splx(s);
				return (0);
			}
			/* Arrange a wakeup in one second in case output
			 * never drains on its own. */
			timeout((void (*)__P((void *)))wakeup,
			    (void *)&tp->t_outq, hz);
			SET(tp->t_state, TS_ASLEEP);
			sleep((caddr_t)&tp->t_outq, PZERO - 1);
		}
	splx(s);
	return (1);
}

/*
 * Process a write call on a tty device.
 *
 * tp	- tty to write to
 * uio	- source of user data
 * flag	- IO_NDELAY selects non-blocking behavior
 *
 * Waits for carrier (unless CLOCAL), honors TOSTOP job control
 * (SIGTTOU), then copies user data in OBUFSIZ chunks, routing special
 * characters through ttyoutput() and transferring runs of ordinary
 * characters with b_to_q().  Sleeps at `ovhiwat' when the output queue
 * exceeds the high-water mark.
 */
int
ttwrite(tp, uio, flag)
	register struct tty *tp;
	register struct uio *uio;
	int flag;
{
	register char *cp;
	register int cc, ce;
	register struct proc *p;
	int i, hiwat, cnt, error, s;
	char obuf[OBUFSIZ];

	hiwat = tp->t_hiwat;
	cnt = uio->uio_resid;
	error = 0;
	cc = 0;
loop:
	s = spltty();
	if (!ISSET(tp->t_state, TS_CARR_ON) &&
	    !ISSET(tp->t_cflag, CLOCAL)) {
		if (ISSET(tp->t_state, TS_ISOPEN)) {
			splx(s);
			return (EIO);
		} else if (flag & IO_NDELAY) {
			splx(s);
			error = EWOULDBLOCK;
			goto out;
		} else {
			/* Sleep awaiting carrier. */
			error = ttysleep(tp,
			    &tp->t_rawq, TTIPRI | PCATCH,ttopen, 0);
			splx(s);
			if (error)
				goto out;
			goto loop;
		}
	}
	splx(s);
	/*
	 * Hang the process if it's in the background.
	 */
	p = curproc;
	if (isbackground(p, tp) &&
	    ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 &&
	    (p->p_sigignore & sigmask(SIGTTOU)) == 0 &&
	    (p->p_sigmask & sigmask(SIGTTOU)) == 0 &&
	     p->p_pgrp->pg_jobc) {
		pgsignal(p->p_pgrp, SIGTTOU, 1);
		if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0))
			goto out;
		goto loop;
	}
	/*
	 * Process the user's data in at most OBUFSIZ chunks.  Perform any
	 * output translation.  Keep track of high water mark, sleep on
	 * overflow awaiting device aid in acquiring new space.
	 */
	while (uio->uio_resid > 0 || cc > 0) {
		if (ISSET(tp->t_lflag, FLUSHO)) {
			/* ^O discard: pretend everything was written. */
			uio->uio_resid = 0;
			return (0);
		}
		if (tp->t_outq.c_cc > hiwat)
			goto ovhiwat;
		/*
		 * Grab a hunk of data from the user, unless we have some
		 * leftover from last time.
		 */
		if (cc == 0) {
			cc = min(uio->uio_resid, OBUFSIZ);
			cp = obuf;
			error = uiomove(cp, cc, uio);
			if (error) {
				cc = 0;
				break;
			}
		}
		/*
		 * If nothing fancy need be done, grab those characters we
		 * can handle without any of ttyoutput's processing and
		 * just transfer them to the output q.  For those chars
		 * which require special processing (as indicated by the
		 * bits in char_type), call ttyoutput.  After processing
		 * a hunk of data, look for FLUSHO so ^O's will take effect
		 * immediately.
		 */
		while (cc > 0) {
			if (!ISSET(tp->t_oflag, OPOST))
				ce = cc;
			else {
				/* ce = length of the leading run of
				 * "ordinary" characters needing no
				 * ttyoutput() processing. */
				ce = cc - scanc((u_int)cc, (u_char *)cp,
				   (u_char *)char_type, CCLASSMASK);
				/*
				 * If ce is zero, then we're processing
				 * a special character through ttyoutput.
				 */
				if (ce == 0) {
					tp->t_rocount = 0;
					if (ttyoutput(*cp, tp) >= 0) {
						/* No Clists, wait a bit. */
						ttstart(tp);
						if (error = ttysleep(tp, &lbolt,
						    TTOPRI | PCATCH, ttybuf, 0))
							break;
						goto loop;
					}
					cp++;
					cc--;
					if (ISSET(tp->t_lflag, FLUSHO) ||
					    tp->t_outq.c_cc > hiwat)
						goto ovhiwat;
					continue;
				}
			}
			/*
			 * A bunch of normal characters have been found.
			 * Transfer them en masse to the output queue and
			 * continue processing at the top of the loop.
			 * If there are any further characters in this
			 * <= OBUFSIZ chunk, the first should be a character
			 * requiring special handling by ttyoutput.
			 */
			tp->t_rocount = 0;
			i = b_to_q(cp, ce, &tp->t_outq);
			ce -= i;
			tp->t_column += ce;
			cp += ce, cc -= ce, tk_nout += ce;
			tp->t_outcc += ce;
			if (i > 0) {
				/* No Clists, wait a bit. */
				ttstart(tp);
				if (error = ttysleep(tp,
				    &lbolt, TTOPRI | PCATCH, ttybuf, 0))
					break;
				goto loop;
			}
			if (ISSET(tp->t_lflag, FLUSHO) ||
			    tp->t_outq.c_cc > hiwat)
				break;
		}
		ttstart(tp);
	}
out:
	/*
	 * If cc is nonzero, we leave the uio structure inconsistent, as the
	 * offset and iov pointers have moved forward, but it doesn't matter
	 * (the call will either return short or restart with a new uio).
	 */
	uio->uio_resid += cc;
	return (error);

ovhiwat:
	ttstart(tp);
	s = spltty();
	/*
	 * This can only occur if FLUSHO is set in t_lflag,
	 * or if ttstart/oproc is synchronous (or very fast).
	 */
	if (tp->t_outq.c_cc <= hiwat) {
		splx(s);
		goto loop;
	}
	if (flag & IO_NDELAY) {
		splx(s);
		uio->uio_resid += cc;
		/* Report EWOULDBLOCK only if nothing was written at all. */
		return (uio->uio_resid == cnt ? EWOULDBLOCK : 0);
	}
	SET(tp->t_state, TS_ASLEEP);
	error = ttysleep(tp, &tp->t_outq, TTOPRI | PCATCH, ttyout, 0);
	splx(s);
	if (error)
		goto out;
	goto loop;
}

/*
 * Rubout one character from the rawq of tp
 * as cleanly as possible.
 */
/*
 * ttyrub --
 *	Visually erase character `c' being rubbed out of the raw queue.
 *	With ECHOE, erases by character class (ORDINARY backs over one
 *	column, echoed control characters over two, and TAB replays the
 *	line under FLUSHO/TS_CNTTB to recompute the tab's width, capped
 *	at 8 columns).  With ECHOPRT, prints erased characters between
 *	`\' and (elsewhere) `/'.  Otherwise just echoes the erase char.
 */
void
ttyrub(c, tp)
	register int c;
	register struct tty *tp;
{
	register char *cp;
	register int savecol;
	int tabc, s;

	if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
		return;
	CLR(tp->t_lflag, FLUSHO);
	if (ISSET(tp->t_lflag, ECHOE)) {
		if (tp->t_rocount == 0) {
			/*
			 * Screwed by ttwrite; retype
			 */
			ttyretype(tp);
			return;
		}
		if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE))
			ttyrubo(tp, 2);
		else {
			CLR(c, ~TTY_CHARMASK);
			switch (CCLASS(c)) {
			case ORDINARY:
				ttyrubo(tp, 1);
				break;
			case BACKSPACE:
			case CONTROL:
			case NEWLINE:
			case RETURN:
			case VTAB:
				/* Echoed as ^X (two columns) under ECHOCTL. */
				if (ISSET(tp->t_lflag, ECHOCTL))
					ttyrubo(tp, 2);
				break;
			case TAB:
				if (tp->t_rocount < tp->t_rawq.c_cc) {
					ttyretype(tp);
					return;
				}
				/* Re-echo the line with output suppressed
				 * (FLUSHO) purely to advance t_column, so we
				 * can compute how wide this tab was. */
				s = spltty();
				savecol = tp->t_column;
				SET(tp->t_state, TS_CNTTB);
				SET(tp->t_lflag, FLUSHO);
				tp->t_column = tp->t_rocol;
				cp = tp->t_rawq.c_cf;
				if (cp)
					tabc = *cp;	/* XXX FIX NEXTC */
				for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc))
					ttyecho(tabc, tp);
				CLR(tp->t_lflag, FLUSHO);
				CLR(tp->t_state, TS_CNTTB);
				splx(s);

				/* savecol will now be length of the tab. */
				savecol -= tp->t_column;
				tp->t_column += savecol;
				if (savecol > 8)
					savecol = 8;	/* overflow screw */
				while (--savecol >= 0)
					(void)ttyoutput('\b', tp);
				break;
			default:			/* XXX */
#define	PANICSTR	"ttyrub: would panic c = %d, val = %d\n"
				(void)printf(PANICSTR, c, CCLASS(c));
#ifdef notdef
				panic(PANICSTR, c, CCLASS(c));
#endif
			}
		}
	} else if (ISSET(tp->t_lflag, ECHOPRT)) {
		if (!ISSET(tp->t_state, TS_ERASE)) {
			SET(tp->t_state, TS_ERASE);
			(void)ttyoutput('\\', tp);
		}
		ttyecho(c, tp);
	} else
		ttyecho(tp->t_cc[VERASE], tp);
	--tp->t_rocount;
}

/*
 * Back over cnt characters, erasing them.
 */
static void
ttyrubo(tp, cnt)
	register struct tty *tp;
	int cnt;
{

	/* Backspace-space-backspace for each column being erased. */
	while (cnt-- > 0) {
		(void)ttyoutput('\b', tp);
		(void)ttyoutput(' ', tp);
		(void)ttyoutput('\b', tp);
	}
}

/*
 * ttyretype --
 *	Reprint the rawq line.  Note, it is assumed that c_cc has already
 *	been checked.
 */
void
ttyretype(tp)
	register struct tty *tp;
{
	register char *cp;
	int s, c;

	/* Echo the reprint character. */
	if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE)
		ttyecho(tp->t_cc[VREPRINT], tp);

	(void)ttyoutput('\n', tp);

	/*
	 * XXX
	 * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE
	 * BIT OF FIRST CHAR.
	 */
	s = spltty();
	/* Re-echo any completed (canonical) input first, then the raw queue. */
	for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0);
	    cp != NULL; cp = nextc(&tp->t_canq, cp, &c))
		ttyecho(c, tp);
	for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0);
	    cp != NULL; cp = nextc(&tp->t_rawq, cp, &c))
		ttyecho(c, tp);
	CLR(tp->t_state, TS_ERASE);
	splx(s);

	tp->t_rocount = tp->t_rawq.c_cc;
	tp->t_rocol = 0;
}

/*
 * Echo a typed character to the terminal.
 *
 * Under ECHOCTL, control characters (other than tab and newline) and
 * DEL are echoed as a two-character ^X sequence.
 */
static void
ttyecho(c, tp)
	register int c;
	register struct tty *tp;
{

	if (!ISSET(tp->t_state, TS_CNTTB))
		CLR(tp->t_lflag, FLUSHO);
	if ((!ISSET(tp->t_lflag, ECHO) &&
	    (!ISSET(tp->t_lflag, ECHONL) || c == '\n')) ||
	    ISSET(tp->t_lflag, EXTPROC))
		return;
	if (ISSET(tp->t_lflag, ECHOCTL) &&
	    (ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n' ||
	    ISSET(c, TTY_CHARMASK) == 0177)) {
		(void)ttyoutput('^', tp);
		CLR(c, ~TTY_CHARMASK);
		if (c == 0177)
			c = '?';
		else
			c += 'A' - 1;	/* control char -> printable letter */
	}
	(void)ttyoutput(c, tp);
}

/*
 * Wake up any readers on a tty.
 *
 * Notifies selectors, posts SIGIO to the foreground process group when
 * the tty is in async mode, and wakes sleepers on the raw queue.
 */
void
ttwakeup(tp)
	register struct tty *tp;
{

	selwakeup(&tp->t_rsel);
	if (ISSET(tp->t_state, TS_ASYNC))
		pgsignal(tp->t_pgrp, SIGIO, 1);
	wakeup((caddr_t)&tp->t_rawq);
}

/*
 * Look up a code for a specified speed in a conversion table;
 * used by drivers to map software speed values to hardware parameters.
 */
int
ttspeedtab(speed, table)
	int speed;
	register struct speedtab *table;
{

	/* Table is terminated by an sp_speed of -1; -1 is also the
	 * "not found" return. */
	for ( ; table->sp_speed != -1; table++)
		if (table->sp_speed == speed)
			return (table->sp_code);
	return (-1);
}

/*
 * Set tty hi and low water marks.
 *
 * Try to arrange the dynamics so there's about one second
 * from hi to low water.
 *
 */
void
ttsetwater(tp)
	struct tty *tp;
{
	register int cps, x;

#define CLAMP(x, h, l)	((x) > h ? h : ((x) < l) ? l : (x))

	/* cps = output characters per second at the current speed. */
	cps = tp->t_ospeed / 10;
	tp->t_lowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT);
	x += cps;
	x = CLAMP(x, TTMAXHIWAT, TTMINHIWAT);
	tp->t_hiwat = roundup(x, CBSIZE);
#undef	CLAMP
}

/*
 * Report on state of foreground process group.
 * (The ^T / SIGINFO status line: load average, chosen process,
 * its state, user/system time, %cpu and resident set size.)
 */
void
ttyinfo(tp)
	register struct tty *tp;
{
	register struct proc *p, *pick;
	struct timeval utime, stime;
	int tmp;

	if (ttycheckoutq(tp,0) == 0)
		return;

	/* Print load average. */
	tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
	ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100);

	if (tp->t_session == NULL)
		ttyprintf(tp, "not a controlling terminal\n");
	else if (tp->t_pgrp == NULL)
		ttyprintf(tp, "no foreground process group\n");
	else if ((p = tp->t_pgrp->pg_members.lh_first) == 0)
		ttyprintf(tp, "empty foreground process group\n");
	else {
		/* Pick interesting process. */
		for (pick = NULL; p != 0; p = p->p_pglist.le_next)
			if (proc_compare(pick, p))
				pick = p;

		ttyprintf(tp, " cmd: %s %d [%s] ", pick->p_comm, pick->p_pid,
		    pick->p_stat == SRUN ? "running" :
		    pick->p_wmesg ? pick->p_wmesg : "iowait");

		calcru(pick, &utime, &stime, NULL);

		/* Print user time. */
		ttyprintf(tp, "%d.%02du ",
		    utime.tv_sec, (utime.tv_usec + 5000) / 10000);

		/* Print system time. */
		ttyprintf(tp, "%d.%02ds ",
		    stime.tv_sec, (stime.tv_usec + 5000) / 10000);

#define	pgtok(a)	(((a) * NBPG) / 1024)
		/* Print percentage cpu, resident set size. */
		tmp = pick->p_pctcpu * 10000 + FSCALE / 2 >> FSHIFT;
		ttyprintf(tp, "%d%% %dk\n",
		    tmp / 100,
		    pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 :
#ifdef pmap_resident_count
		    pgtok(pmap_resident_count(&pick->p_vmspace->vm_pmap))
#else
		    pgtok(pick->p_vmspace->vm_rssize)
#endif
		    );
	}
	tp->t_rocount = 0;	/* so pending input will be retyped if BS */
}

/*
 * Returns 1 if p2 is "better" than p1
 *
 * The algorithm for picking the "interesting" process is thus:
 *
 *	1) Only foreground processes are eligible - implied.
 *	2) Runnable processes are favored over anything else.  The runner
 *	   with the highest cpu utilization is picked (p_estcpu).  Ties are
 *	   broken by picking the highest pid.
 *	3) The sleeper with the shortest sleep time is next.  With ties,
 *	   we pick out just "short-term" sleepers (P_SINTR == 0).
 *	4) Further ties are broken by picking the highest pid.
 */
#define ISRUN(p)	(((p)->p_stat == SRUN) || ((p)->p_stat == SIDL))
#define TESTAB(a, b)    ((a)<<1 | (b))
#define ONLYA   2
#define ONLYB   1
#define BOTH    3

static int
proc_compare(p1, p2)
	register struct proc *p1, *p2;
{

	if (p1 == NULL)
		return (1);
	/*
	 * see if at least one of them is runnable
	 */
	switch (TESTAB(ISRUN(p1), ISRUN(p2))) {
	case ONLYA:
		return (0);
	case ONLYB:
		return (1);
	case BOTH:
		/*
		 * tie - favor one with highest recent cpu utilization
		 */
		if (p2->p_estcpu > p1->p_estcpu)
			return (1);
		if (p1->p_estcpu > p2->p_estcpu)
			return (0);
		return (p2->p_pid > p1->p_pid);	/* tie - return highest pid */
	}
	/*
 	 * weed out zombies
	 */
	switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) {
	case ONLYA:
		return (1);
	case ONLYB:
		return (0);
	case BOTH:
		return (p2->p_pid > p1->p_pid);	/* tie - return highest pid */
	}
	/*
	 * pick the one with the smallest sleep time
	 */
	if (p2->p_slptime > p1->p_slptime)
		return (0);
	if (p1->p_slptime > p2->p_slptime)
		return (1);
	/*
	 * favor one sleeping in a non-interruptible sleep
	 */
	if (p1->p_flag & P_SINTR && (p2->p_flag & P_SINTR) == 0)
		return (1);
	if (p2->p_flag & P_SINTR && (p1->p_flag & P_SINTR) == 0)
		return (0);
	return (p2->p_pid > p1->p_pid);		/* tie - return highest pid */
}

/*
 * Output char to tty; console putchar style.
 * Returns -1 if the tty is not open with carrier, 0 otherwise.
 * Maps '\n' to CRLF.
 */
int
tputchar(c, tp)
	int c;
	struct tty *tp;
{
	register int s;

	s = spltty();
	if (ISSET(tp->t_state,
	    TS_CARR_ON | TS_ISOPEN) != (TS_CARR_ON | TS_ISOPEN)) {
		splx(s);
		return (-1);
	}
	if (c == '\n')
		(void)ttyoutput('\r', tp);
	(void)ttyoutput(c, tp);
	ttstart(tp);
	splx(s);
	return (0);
}

/*
 * Sleep on chan, returning ERESTART if tty changed while we napped and
 * returning any errors (e.g. EINTR/ETIMEDOUT) reported by tsleep.  If
 * the tty is revoked, restarting a pending call will redo validation done
 * at the start of the call.
 */
int
ttysleep(tp, chan, pri, wmesg, timo)
	struct tty *tp;
	void *chan;
	int pri, timo;
	char *wmesg;
{
	int error;
	short gen;

	/* t_gen is bumped on revoke; compare after sleeping. */
	gen = tp->t_gen;
	if (error = tsleep(chan, pri, wmesg, timo))
		return (error);
	return (tp->t_gen == gen ? 0 : ERESTART);
}
diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c
new file mode 100644
index 000000000000..ce95853a00eb
--- /dev/null
+++ b/sys/kern/tty_compat.c
@@ -0,0 +1,411 @@
/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * @(#)tty_compat.c	8.2 (Berkeley) 1/9/95
 */

/*
 * mapping routines for old line discipline (yuck)
 */
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/termios.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/syslog.h>

int ttydebug = 0;

/* Map termios baud rates to the old sgttyb 0..15 speed codes (and back,
 * via compatspcodes).  Terminated by { -1, -1 } for ttspeedtab(). */
static struct speedtab compatspeeds[] = {
	{ 38400, 15 },
	{ 19200, 14 },
	{ 9600,	13 },
	{ 4800,	12 },
	{ 2400,	11 },
	{ 1800,	10 },
	{ 1200,	9 },
	{ 600,	8 },
	{ 300,	7 },
	{ 200,	6 },
	{ 150,	5 },
	{ 134,	4 },
	{ 110,	3 },
	{ 75,	2 },
	{ 50,	1 },
	{ 0,	0 },
	{ -1,	-1 },
};
static int compatspcodes[16] = {
	0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
	1800, 2400, 4800, 9600, 19200, 38400,
};

/*
 * ttcompat --
 *	Translate old (4.3BSD sgtty) ioctls into termios equivalents.
 *	"Get" commands read tp directly; "set" commands build a termios
 *	and hand it to ttioctl().  Returns 0 on success, an errno from
 *	ttioctl(), or -1 if the command is not a compat ioctl.
 */
/*ARGSUSED*/
ttcompat(tp, com, data, flag)
	register struct tty *tp;
	u_long com;
	caddr_t data;
	int flag;
{

	switch (com) {
	case TIOCGETP: {
		register struct sgttyb *sg = (struct sgttyb *)data;
		register u_char *cc = tp->t_cc;
		register speed;

		speed = ttspeedtab(tp->t_ospeed, compatspeeds);
		sg->sg_ospeed = (speed == -1) ? 15 : speed;
		if (tp->t_ispeed == 0)
			sg->sg_ispeed = sg->sg_ospeed;
		else {
			speed = ttspeedtab(tp->t_ispeed, compatspeeds);
			sg->sg_ispeed = (speed == -1) ? 15 : speed;
		}
		sg->sg_erase = cc[VERASE];
		sg->sg_kill = cc[VKILL];
		sg->sg_flags = ttcompatgetflags(tp);
		break;
	}

	case TIOCSETP:
	case TIOCSETN: {
		register struct sgttyb *sg = (struct sgttyb *)data;
		struct termios term;
		int speed;

		term = tp->t_termios;
		/* Speed codes outside 0..15 are passed through as literal
		 * baud rates; otherwise decode via compatspcodes. */
		if ((speed = sg->sg_ispeed) > 15 || speed < 0)
			term.c_ispeed = speed;
		else
			term.c_ispeed = compatspcodes[speed];
		if ((speed = sg->sg_ospeed) > 15 || speed < 0)
			term.c_ospeed = speed;
		else
			term.c_ospeed = compatspcodes[speed];
		term.c_cc[VERASE] = sg->sg_erase;
		term.c_cc[VKILL] = sg->sg_kill;
		tp->t_flags = tp->t_flags&0xffff0000 | sg->sg_flags&0xffff;
		ttcompatsetflags(tp, &term);
		/* TIOCSETP drains and flushes; TIOCSETN does not. */
		return (ttioctl(tp, com == TIOCSETP ? TIOCSETAF : TIOCSETA,
		    &term, flag));
	}

	case TIOCGETC: {
		struct tchars *tc = (struct tchars *)data;
		register u_char *cc = tp->t_cc;

		tc->t_intrc = cc[VINTR];
		tc->t_quitc = cc[VQUIT];
		tc->t_startc = cc[VSTART];
		tc->t_stopc = cc[VSTOP];
		tc->t_eofc = cc[VEOF];
		tc->t_brkc = cc[VEOL];
		break;
	}
	case TIOCSETC: {
		struct tchars *tc = (struct tchars *)data;
		register u_char *cc = tp->t_cc;

		cc[VINTR] = tc->t_intrc;
		cc[VQUIT] = tc->t_quitc;
		cc[VSTART] = tc->t_startc;
		cc[VSTOP] = tc->t_stopc;
		cc[VEOF] = tc->t_eofc;
		cc[VEOL] = tc->t_brkc;
		if (tc->t_brkc == -1)
			cc[VEOL2] = _POSIX_VDISABLE;
		break;
	}
	case TIOCSLTC: {
		struct ltchars *ltc = (struct ltchars *)data;
		register u_char *cc = tp->t_cc;

		cc[VSUSP] = ltc->t_suspc;
		cc[VDSUSP] = ltc->t_dsuspc;
		cc[VREPRINT] = ltc->t_rprntc;
		cc[VDISCARD] = ltc->t_flushc;
		cc[VWERASE] = ltc->t_werasc;
		cc[VLNEXT] = ltc->t_lnextc;
		break;
	}
	case TIOCGLTC: {
		struct ltchars *ltc = (struct ltchars *)data;
		register u_char *cc = tp->t_cc;

		ltc->t_suspc = cc[VSUSP];
		ltc->t_dsuspc = cc[VDSUSP];
		ltc->t_rprntc = cc[VREPRINT];
		ltc->t_flushc = cc[VDISCARD];
		ltc->t_werasc = cc[VWERASE];
		ltc->t_lnextc = cc[VLNEXT];
		break;
	}
	case TIOCLBIS:
	case TIOCLBIC:
	case TIOCLSET: {
		struct termios term;

		term = tp->t_termios;
		/* The "local" flag word lives in the top 16 bits of
		 * t_flags; set/bis/bic it, then resynthesize termios. */
		if (com == TIOCLSET)
			tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16;
		else {
			tp->t_flags =
			 (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff);
			if (com == TIOCLBIS)
				tp->t_flags |= *(int *)data<<16;
			else
				tp->t_flags &= ~(*(int *)data<<16);
		}
		ttcompatsetlflags(tp, &term);
		return (ttioctl(tp, TIOCSETA, &term, flag));
	}
	case TIOCLGET:
		*(int *)data = ttcompatgetflags(tp)>>16;
		if (ttydebug)
			printf("CLGET: returning %x\n", *(int *)data);
		break;

	case OTIOCGETD:
		/* Old discipline numbering: 0 meant "old tty"; report 2. */
		*(int *)data = tp->t_line ? tp->t_line : 2;
		break;

	case OTIOCSETD: {
		int ldisczero = 0;

		return (ttioctl(tp, TIOCSETD,
			*(int *)data == 2 ? (caddr_t)&ldisczero : data, flag));
	}

	case OTIOCCONS:
		*(int *)data = 1;
		return (ttioctl(tp, TIOCCONS, data, flag));

	default:
		/* Not a compat ioctl; let the caller try elsewhere. */
		return (-1);
	}
	return (0);
}

/*
 * ttcompatgetflags --
 *	Synthesize the old-style 32-bit sgtty flags word from the tty's
 *	current termios flags.
 */
ttcompatgetflags(tp)
	register struct tty *tp;
{
	register long iflag	= tp->t_iflag;
	register long lflag	= tp->t_lflag;
	register long oflag	= tp->t_oflag;
	register long cflag	= tp->t_cflag;
	register flags = 0;

	if (iflag&IXOFF)
		flags |= TANDEM;
	if (iflag&ICRNL || oflag&ONLCR)
		flags |= CRMOD;
	if (cflag&PARENB) {
		if (iflag&INPCK) {
			if (cflag&PARODD)
				flags |= ODDP;
			else
				flags |= EVENP;
		} else
			flags |= EVENP | ODDP;
	} else {
		if ((tp->t_flags&LITOUT) && !(oflag&OPOST))
			flags |= LITOUT;
		if (tp->t_flags&PASS8)
			flags |= PASS8;
	}

	if ((lflag&ICANON) == 0) {
		/* fudge */
		if (iflag&IXON || lflag&ISIG || lflag&IEXTEN || cflag&PARENB)
			flags |= CBREAK;
		else
			flags |= RAW;
	}
	if (cflag&MDMBUF)
		flags |= MDMBUF;
	if ((cflag&HUPCL) == 0)
		flags |= NOHANG;
	if (oflag&OXTABS)
		flags |= XTABS;
	if (lflag&ECHOE)
		flags |= CRTERA|CRTBS;
	if (lflag&ECHOKE)
		flags |= CRTKIL|CRTBS;
	if (lflag&ECHOPRT)
		flags |= PRTERA;
	if (lflag&ECHOCTL)
		flags |= CTLECH;
	if ((iflag&IXANY) == 0)
		flags |= DECCTQ;
	flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH);
if (ttydebug)
	printf("getflags: %x\n", flags);
	return (flags);
}

/*
 * ttcompatsetflags --
 *	Apply the low 16 bits of the old flags word (already stored in
 *	tp->t_flags) to the termios image in *t.
 */
ttcompatsetflags(tp, t)
	register struct tty *tp;
	register struct termios *t;
{
	register flags = tp->t_flags;
	register long iflag	= t->c_iflag;
	register long oflag	= t->c_oflag;
	register long lflag	= t->c_lflag;
	register long cflag	= t->c_cflag;

	if (flags & RAW) {
		iflag &= IXOFF;
		oflag &= ~OPOST;
		lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN);
	} else {
		iflag |= BRKINT|IXON|IMAXBEL;
		oflag |= OPOST;
		lflag |= ISIG|IEXTEN|ECHOCTL;	/* XXX was echoctl on ? */
		if (flags & XTABS)
			oflag |= OXTABS;
		else
			oflag &= ~OXTABS;
		if (flags & CBREAK)
			lflag &= ~ICANON;
		else
			lflag |= ICANON;
		if (flags&CRMOD) {
			iflag |= ICRNL;
			oflag |= ONLCR;
		} else {
			iflag &= ~ICRNL;
			oflag &= ~ONLCR;
		}
	}
	if (flags&ECHO)
		lflag |= ECHO;
	else
		lflag &= ~ECHO;

	if (flags&(RAW|LITOUT|PASS8)) {
		cflag &= ~(CSIZE|PARENB);
		cflag |= CS8;
		if ((flags&(RAW|PASS8)) == 0)
			iflag |= ISTRIP;
		else
			iflag &= ~ISTRIP;
	} else {
		cflag &= ~CSIZE;
		cflag |= CS7|PARENB;
		iflag |= ISTRIP;
	}
	if ((flags&(EVENP|ODDP)) == EVENP) {
		iflag |= INPCK;
		cflag &= ~PARODD;
	} else if ((flags&(EVENP|ODDP)) == ODDP) {
		iflag |= INPCK;
		cflag |= PARODD;
	} else
		iflag &= ~INPCK;
	if (flags&LITOUT)
		oflag &= ~OPOST;	/* move earlier ? */
	if (flags&TANDEM)
		iflag |= IXOFF;
	else
		iflag &= ~IXOFF;
	t->c_iflag = iflag;
	t->c_oflag = oflag;
	t->c_lflag = lflag;
	t->c_cflag = cflag;
}

/*
 * ttcompatsetlflags --
 *	Apply the high 16 bits ("local mode" word) of the old flags word
 *	(already stored in tp->t_flags) to the termios image in *t.
 */
ttcompatsetlflags(tp, t)
	register struct tty *tp;
	register struct termios *t;
{
	register flags = tp->t_flags;
	register long iflag	= t->c_iflag;
	register long oflag	= t->c_oflag;
	register long lflag	= t->c_lflag;
	register long cflag	= t->c_cflag;

	if (flags&CRTERA)
		lflag |= ECHOE;
	else
		lflag &= ~ECHOE;
	if (flags&CRTKIL)
		lflag |= ECHOKE;
	else
		lflag &= ~ECHOKE;
	if (flags&PRTERA)
		lflag |= ECHOPRT;
	else
		lflag &= ~ECHOPRT;
	if (flags&CTLECH)
		lflag |= ECHOCTL;
	else
		lflag &= ~ECHOCTL;
	if ((flags&DECCTQ) == 0)
		iflag |= IXANY;
	else
		iflag &= ~IXANY;
	if (flags & MDMBUF)
		cflag |= MDMBUF;
	else
		cflag &= ~MDMBUF;
	if (flags&NOHANG)
		cflag &= ~HUPCL;
	else
		cflag |= HUPCL;
	lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH);
	lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH);
	if (flags&(LITOUT|PASS8)) {
		iflag &= ~ISTRIP;
		cflag &= ~(CSIZE|PARENB);
		cflag |= CS8;
		if (flags&LITOUT)
			oflag &= ~OPOST;
		if ((flags&(PASS8|RAW)) == 0)
			iflag |= ISTRIP;
	} else if ((flags&RAW) == 0) {
		cflag &= ~CSIZE;
		cflag |= CS7|PARENB;
		oflag |= OPOST;
	}
	t->c_iflag = iflag;
	t->c_oflag = oflag;
	t->c_lflag = lflag;
	t->c_cflag = cflag;
}
#endif	/* COMPAT_43 || COMPAT_SUNOS */
diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c
new file mode 100644
index 000000000000..14536758116e
--- /dev/null
+++ b/sys/kern/tty_conf.c
@@ -0,0 +1,126 @@
/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * @(#)tty_conf.c	8.5 (Berkeley) 1/9/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/conf.h>

/* Error stubs (all aliases of enodev) used to fill unconfigured or
 * defunct line-discipline slots. */
#define	ttynodisc ((int (*) __P((dev_t, struct tty *)))enodev)
#define	ttyerrclose ((int (*) __P((struct tty *, int flags)))enodev)
#define	ttyerrio ((int (*) __P((struct tty *, struct uio *, int)))enodev)
#define	ttyerrinput ((int (*) __P((int c, struct tty *)))enodev)
#define	ttyerrstart ((int (*) __P((struct tty *)))enodev)

int	nullioctl __P((struct tty *tp, u_long cmd, caddr_t data,
			int flag, struct proc *p));

#include "tb.h"
#if NTB > 0
int	tbopen __P((dev_t dev, struct tty *tp));
int	tbclose __P((struct tty *tp, int flags));
int	tbread __P((struct tty *, struct uio *, int flags));
int	tbioctl __P((struct tty *tp, u_long cmd, caddr_t data,
			int flag, struct proc *p));
int	tbinput __P((int c, struct tty *tp));
#endif

#include "sl.h"
#if NSL > 0
int	slopen __P((dev_t dev, struct tty *tp));
int	slclose __P((struct tty *tp, int flags));
int	sltioctl __P((struct tty *tp, u_long cmd, caddr_t data,
			int flag, struct proc *p));
int	slinput __P((int c, struct tty *tp));
int	slstart __P((struct tty *tp));
#endif


/*
 * The line-discipline switch: indexed by t_line; entries for the
 * optional tablet (TABLDISC) and SLIP (SLIPDISC) disciplines are
 * compiled in only when configured (NTB/NSL from config headers).
 */
struct	linesw linesw[] =
{
	{ ttyopen, ttylclose, ttread, ttwrite, nullioctl,
	  ttyinput, ttstart, ttymodem },		/* 0- termios */

	{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
	  ttyerrinput, ttyerrstart, nullmodem },	/* 1- defunct */

	{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
	  ttyerrinput, ttyerrstart, nullmodem },	/* 2- defunct */

#if NTB > 0
	{ tbopen, tbclose, tbread, enodev, tbioctl,
	  tbinput, ttstart, nullmodem },		/* 3- TABLDISC */
#else
	{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
	  ttyerrinput, ttyerrstart, nullmodem },
#endif

#if NSL > 0
	{ slopen, slclose, ttyerrio, ttyerrio, sltioctl,
	  slinput, slstart, nullmodem },		/* 4- SLIPDISC */
#else
	{ ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
	  ttyerrinput, ttyerrstart, nullmodem },
#endif
};

int	nlinesw = sizeof (linesw) / sizeof (linesw[0]);

/*
 * Do nothing specific version of line
 * discipline specific ioctl command.
 * Returns -1 so the caller falls through to generic tty ioctl handling.
 */
/*ARGSUSED*/
nullioctl(tp, cmd, data, flags, p)
	struct tty *tp;
	u_long cmd;
	char *data;
	int flags;
	struct proc *p;
{

#ifdef lint
	tp = tp; data = data; flags = flags; p = p;
#endif
	return (-1);
}
diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c
new file mode 100644
index 000000000000..2c37984ab03d
--- /dev/null
+++ b/sys/kern/tty_pty.c
@@ -0,0 +1,691 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 + */ + +/* + * Pseudo-teletype Driver + * (Actually two drivers, requiring two entries in 'cdevsw') + */ +#include "pty.h" /* XXX */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/vnode.h> + +#if NPTY == 1 +#undef NPTY +#define NPTY 32 /* crude XXX */ +#endif + +#define BUFSIZ 100 /* Chunk size iomoved to/from user */ + +/* + * pts == /dev/tty[pqrs]? + * ptc == /dev/pty[pqrs]? + */ +struct tty pt_tty[NPTY]; /* XXX */ +struct pt_ioctl { + int pt_flags; + struct selinfo pt_selr, pt_selw; + u_char pt_send; + u_char pt_ucntl; +} pt_ioctl[NPTY]; /* XXX */ +int npty = NPTY; /* for pstat -t */ + +#define PF_PKT 0x08 /* packet mode */ +#define PF_STOPPED 0x10 /* user told stopped */ +#define PF_REMOTE 0x20 /* remote and flow controlled input */ +#define PF_NOSTOP 0x40 +#define PF_UCNTL 0x80 /* user control mode */ + +void ptsstop __P((struct tty *, int)); + +/* + * Establish n (or default if n is 1) ptys in the system. + * + * XXX cdevsw & pstat require the array `pty[]' to be an array + */ +void +ptyattach(n) + int n; +{ +#ifdef notyet + char *mem; + register u_long ntb; +#define DEFAULT_NPTY 32 + + /* maybe should allow 0 => none? 
*/ + if (n <= 1) + n = DEFAULT_NPTY; + ntb = n * sizeof(struct tty); + mem = malloc(ntb + ALIGNBYTES + n * sizeof(struct pt_ioctl), + M_DEVBUF, M_WAITOK); + pt_tty = (struct tty *)mem; + mem = (char *)ALIGN(mem + ntb); + pt_ioctl = (struct pt_ioctl *)mem; + npty = n; +#endif +} + +/*ARGSUSED*/ +ptsopen(dev, flag, devtype, p) + dev_t dev; + int flag, devtype; + struct proc *p; +{ + register struct tty *tp; + int error; + + if (minor(dev) >= npty) + return (ENXIO); + tp = &pt_tty[minor(dev)]; + if ((tp->t_state & TS_ISOPEN) == 0) { + tp->t_state |= TS_WOPEN; + ttychars(tp); /* Set up default chars */ + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + ttsetwater(tp); /* would be done in xxparam() */ + } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) + return (EBUSY); + if (tp->t_oproc) /* Ctrlr still around. */ + tp->t_state |= TS_CARR_ON; + while ((tp->t_state & TS_CARR_ON) == 0) { + tp->t_state |= TS_WOPEN; + if (flag&FNONBLOCK) + break; + if (error = ttysleep(tp, (caddr_t)&tp->t_rawq, TTIPRI | PCATCH, + ttopen, 0)) + return (error); + } + error = (*linesw[tp->t_line].l_open)(dev, tp); + ptcwakeup(tp, FREAD|FWRITE); + return (error); +} + +ptsclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + register struct tty *tp; + int err; + + tp = &pt_tty[minor(dev)]; + err = (*linesw[tp->t_line].l_close)(tp, flag); + err |= ttyclose(tp); + ptcwakeup(tp, FREAD|FWRITE); + return (err); +} + +ptsread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = curproc; + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if (pti->pt_flags & PF_REMOTE) { + while (isbackground(p, tp)) { + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_pgrp->pg_jobc == 0 || + p->p_flag & P_PPWAIT) + 
return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + if (error = ttysleep(tp, (caddr_t)&lbolt, + TTIPRI | PCATCH, ttybg, 0)) + return (error); + } + if (tp->t_canq.c_cc == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + if (error = ttysleep(tp, (caddr_t)&tp->t_canq, + TTIPRI | PCATCH, ttyin, 0)) + return (error); + goto again; + } + while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0) + if (ureadc(getc(&tp->t_canq), uio) < 0) { + error = EFAULT; + break; + } + if (tp->t_canq.c_cc == 1) + (void) getc(&tp->t_canq); + if (tp->t_canq.c_cc) + return (error); + } else + if (tp->t_oproc) + error = (*linesw[tp->t_line].l_read)(tp, uio, flag); + ptcwakeup(tp, FWRITE); + return (error); +} + +/* + * Write to pseudo-tty. + * Wakeups of controlling tty will happen + * indirectly, when tty driver calls ptsstart. + */ +ptswrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp; + + tp = &pt_tty[minor(dev)]; + if (tp->t_oproc == 0) + return (EIO); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} + +/* + * Start output on pseudo-tty. + * Wake up process selecting or sleeping for input from controlling tty. 
 */
/*
 * ptsstart --
 *	Output-start routine installed as t_oproc on the slave tty.
 *	If output had been stopped, queue a TIOCPKT_START notification
 *	for a packet-mode master, then wake any master-side reader so
 *	it can drain the slave's output queue.
 */
void
ptsstart(tp)
	struct tty *tp;
{
	register struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)];

	/* Output currently suspended; nothing to hand to the master. */
	if (tp->t_state & TS_TTSTOP)
		return;
	if (pti->pt_flags & PF_STOPPED) {
		/* Resuming: let a packet-mode master know output restarted. */
		pti->pt_flags &= ~PF_STOPPED;
		pti->pt_send = TIOCPKT_START;
	}
	ptcwakeup(tp, FREAD);
}

/*
 * ptcwakeup --
 *	Wake master-side sleepers and selectors.  FREAD wakes master
 *	readers (they sleep on t_outq.c_cf); FWRITE wakes master
 *	writers (they sleep on t_rawq.c_cf).
 */
ptcwakeup(tp, flag)
	struct tty *tp;
	int flag;
{
	struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)];

	if (flag & FREAD) {
		selwakeup(&pti->pt_selr);
		wakeup((caddr_t)&tp->t_outq.c_cf);
	}
	if (flag & FWRITE) {
		selwakeup(&pti->pt_selw);
		wakeup((caddr_t)&tp->t_rawq.c_cf);
	}
}

/*
 * ptcopen --
 *	Open the master (controller) side of a pty.  Only one opener is
 *	allowed: a non-null t_oproc marks the master as already open.
 *	Announces carrier to the slave via the line discipline's l_modem
 *	entry and resets the per-pty packet/ucntl state.
 */
/*ARGSUSED*/
#ifdef __STDC__
ptcopen(dev_t dev, int flag, int devtype, struct proc *p)
#else
ptcopen(dev, flag, devtype, p)
	dev_t dev;
	int flag, devtype;
	struct proc *p;
#endif
{
	register struct tty *tp;
	struct pt_ioctl *pti;

	if (minor(dev) >= npty)
		return (ENXIO);
	tp = &pt_tty[minor(dev)];
	if (tp->t_oproc)
		return (EIO);		/* master side already open */
	tp->t_oproc = ptsstart;
#ifdef sun4c
	tp->t_stop = ptsstop;
#endif
	(void)(*linesw[tp->t_line].l_modem)(tp, 1);	/* carrier up */
	tp->t_lflag &= ~EXTPROC;
	pti = &pt_ioctl[minor(dev)];
	pti->pt_flags = 0;
	pti->pt_send = 0;
	pti->pt_ucntl = 0;
	return (0);
}

/*
 * ptcclose --
 *	Close the master side: drop carrier on the slave and clear
 *	t_oproc (the "master open" marker tested in ptcopen) and the
 *	session pointer.
 */
ptcclose(dev)
	dev_t dev;
{
	register struct tty *tp;

	tp = &pt_tty[minor(dev)];
	(void)(*linesw[tp->t_line].l_modem)(tp, 0);	/* carrier down */
	tp->t_state &= ~TS_CARR_ON;
	tp->t_oproc = 0;		/* mark closed */
	tp->t_session = 0;
	return (0);
}

/*
 * ptcread --
 *	Read from the master side: delivers the slave's output queue,
 *	preceded by packet-mode / user-control status bytes when those
 *	modes are enabled.
 */
ptcread(dev, uio, flag)
	dev_t dev;
	struct uio *uio;
	int flag;
{
	register struct tty *tp = &pt_tty[minor(dev)];
	struct pt_ioctl *pti = &pt_ioctl[minor(dev)];
	char buf[BUFSIZ];
	int error = 0, cc;

	/*
	 * We want to block until the slave
	 * is open, and there's something to read;
	 * but if we lost the slave or we're NBIO,
	 * then return the appropriate error instead.
+ */ + for (;;) { + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags&PF_PKT && pti->pt_send) { + error = ureadc((int)pti->pt_send, uio); + if (error) + return (error); + if (pti->pt_send & TIOCPKT_IOCTL) { + cc = min(uio->uio_resid, + sizeof(tp->t_termios)); + uiomove(&tp->t_termios, cc, uio); + } + pti->pt_send = 0; + return (0); + } + if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) { + error = ureadc((int)pti->pt_ucntl, uio); + if (error) + return (error); + pti->pt_ucntl = 0; + return (0); + } + if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) + break; + } + if ((tp->t_state&TS_CARR_ON) == 0) + return (0); /* EOF */ + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + if (error = tsleep((caddr_t)&tp->t_outq.c_cf, TTIPRI | PCATCH, + ttyin, 0)) + return (error); + } + if (pti->pt_flags & (PF_PKT|PF_UCNTL)) + error = ureadc(0, uio); + while (uio->uio_resid > 0 && error == 0) { + cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ)); + if (cc <= 0) + break; + error = uiomove(buf, cc, uio); + } + if (tp->t_outq.c_cc <= tp->t_lowat) { + if (tp->t_state&TS_ASLEEP) { + tp->t_state &= ~TS_ASLEEP; + wakeup((caddr_t)&tp->t_outq); + } + selwakeup(&tp->t_wsel); + } + return (error); +} + +void +ptsstop(tp, flush) + register struct tty *tp; + int flush; +{ + struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + int flag; + + /* note: FLUSHREAD and FLUSHWRITE already ok */ + if (flush == 0) { + flush = TIOCPKT_STOP; + pti->pt_flags |= PF_STOPPED; + } else + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send |= flush; + /* change of perspective */ + flag = 0; + if (flush & FREAD) + flag |= FWRITE; + if (flush & FWRITE) + flag |= FREAD; + ptcwakeup(tp, flag); +} + +ptcselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int s; + + if ((tp->t_state&TS_CARR_ON) == 0) + return (1); + switch (rw) { + + case FREAD: + /* + * Need to block timeouts (ttrstart). 
+ */ + s = spltty(); + if ((tp->t_state&TS_ISOPEN) && + tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) { + splx(s); + return (1); + } + splx(s); + /* FALLTHROUGH */ + + case 0: /* exceptional */ + if ((tp->t_state&TS_ISOPEN) && + (pti->pt_flags&PF_PKT && pti->pt_send || + pti->pt_flags&PF_UCNTL && pti->pt_ucntl)) + return (1); + selrecord(p, &pti->pt_selr); + break; + + + case FWRITE: + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc == 0) + return (1); + } else { + if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) + return (1); + if (tp->t_canq.c_cc == 0 && (tp->t_iflag&ICANON)) + return (1); + } + } + selrecord(p, &pti->pt_selw); + break; + + } + return (0); +} + +ptcwrite(dev, uio, flag) + dev_t dev; + register struct uio *uio; + int flag; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register u_char *cp; + register int cc = 0; + u_char locbuf[BUFSIZ]; + int cnt = 0; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if ((tp->t_state&TS_ISOPEN) == 0) + goto block; + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc) + goto block; + while (uio->uio_resid > 0 && tp->t_canq.c_cc < TTYHOG - 1) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state&TS_ISOPEN) == 0) + return (EIO); + } + if (cc) + (void) b_to_q((char *)cp, cc, &tp->t_canq); + cc = 0; + } + (void) putc(0, &tp->t_canq); + ttwakeup(tp); + wakeup((caddr_t)&tp->t_canq); + return (0); + } + while (uio->uio_resid > 0) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state&TS_ISOPEN) == 0) + return (EIO); + } + while (cc > 0) { + if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && + (tp->t_canq.c_cc > 0 || 
!(tp->t_iflag&ICANON))) { + wakeup((caddr_t)&tp->t_rawq); + goto block; + } + (*linesw[tp->t_line].l_rint)(*cp++, tp); + cnt++; + cc--; + } + cc = 0; + } + return (0); +block: + /* + * Come here to wait for slave to open, for space + * in outq, or space in rawq. + */ + if ((tp->t_state&TS_CARR_ON) == 0) + return (EIO); + if (flag & IO_NDELAY) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + if (cnt == 0) + return (EWOULDBLOCK); + return (0); + } + if (error = tsleep((caddr_t)&tp->t_rawq.c_cf, TTOPRI | PCATCH, + ttyout, 0)) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (error); + } + goto again; +} + +/*ARGSUSED*/ +ptyioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + register u_char *cc = tp->t_cc; + int stop, error; + + /* + * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. + * ttywflush(tp) will hang if there are characters in the outq. + */ + if (cmd == TIOCEXT) { + /* + * When the EXTPROC bit is being toggled, we need + * to send an TIOCPKT_IOCTL if the packet driver + * is turned on. + */ + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag |= EXTPROC; + } else { + if ((tp->t_lflag & EXTPROC) && + (pti->pt_flags & PF_PKT)) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag &= ~EXTPROC; + } + return(0); + } else + if (cdevsw[major(dev)].d_open == ptcopen) + switch (cmd) { + + case TIOCGPGRP: + /* + * We aviod calling ttioctl on the controller since, + * in that case, tp must be the controlling terminal. + */ + *(int *)data = tp->t_pgrp ? 
tp->t_pgrp->pg_id : 0; + return (0); + + case TIOCPKT: + if (*(int *)data) { + if (pti->pt_flags & PF_UCNTL) + return (EINVAL); + pti->pt_flags |= PF_PKT; + } else + pti->pt_flags &= ~PF_PKT; + return (0); + + case TIOCUCNTL: + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) + return (EINVAL); + pti->pt_flags |= PF_UCNTL; + } else + pti->pt_flags &= ~PF_UCNTL; + return (0); + + case TIOCREMOTE: + if (*(int *)data) + pti->pt_flags |= PF_REMOTE; + else + pti->pt_flags &= ~PF_REMOTE; + ttyflush(tp, FREAD|FWRITE); + return (0); + +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif + case TIOCSETD: + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: + ndflush(&tp->t_outq, tp->t_outq.c_cc); + break; + + case TIOCSIG: + if (*(unsigned int *)data >= NSIG) + return(EINVAL); + if ((tp->t_lflag&NOFLSH) == 0) + ttyflush(tp, FREAD|FWRITE); + pgsignal(tp->t_pgrp, *(unsigned int *)data, 1); + if ((*(unsigned int *)data == SIGINFO) && + ((tp->t_lflag&NOKERNINFO) == 0)) + ttyinfo(tp); + return(0); + } + error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); + if (error < 0) + error = ttioctl(tp, cmd, data, flag); + if (error < 0) { + if (pti->pt_flags & PF_UCNTL && + (cmd & ~0xff) == UIOCCMD(0)) { + if (cmd & 0xff) { + pti->pt_ucntl = (u_char)cmd; + ptcwakeup(tp, FREAD); + } + return (0); + } + error = ENOTTY; + } + /* + * If external processing and packet mode send ioctl packet. 
+ */ + if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) { + switch(cmd) { + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: +#endif + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + default: + break; + } + } + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + && CCEQ(cc[VSTART], CTRL('q')); + if (pti->pt_flags & PF_NOSTOP) { + if (stop) { + pti->pt_send &= ~TIOCPKT_NOSTOP; + pti->pt_send |= TIOCPKT_DOSTOP; + pti->pt_flags &= ~PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } else { + if (!stop) { + pti->pt_send &= ~TIOCPKT_DOSTOP; + pti->pt_send |= TIOCPKT_NOSTOP; + pti->pt_flags |= PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } + return (error); +} diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c new file mode 100644 index 000000000000..05a46baa73d3 --- /dev/null +++ b/sys/kern/tty_tb.c @@ -0,0 +1,368 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tb.c 8.2 (Berkeley) 1/9/95 + */ + +#include "tb.h" +#if NTB > 0 + +/* + * Line discipline for RS232 tablets; + * supplies binary coordinate data. + */ +#include <sys/param.h> +#include <sys/tablet.h> +#include <sys/tty.h> + +/* + * Tablet configuration table. 
+ */ +struct tbconf { + short tbc_recsize; /* input record size in bytes */ + short tbc_uiosize; /* size of data record returned user */ + int tbc_sync; /* mask for finding sync byte/bit */ + int (*tbc_decode)();/* decoding routine */ + char *tbc_run; /* enter run mode sequence */ + char *tbc_point; /* enter point mode sequence */ + char *tbc_stop; /* stop sequence */ + char *tbc_start; /* start/restart sequence */ + int tbc_flags; +#define TBF_POL 0x1 /* polhemus hack */ +#define TBF_INPROX 0x2 /* tablet has proximity info */ +}; + +static int tbdecode(), gtcodecode(), poldecode(); +static int tblresdecode(), tbhresdecode(); + +struct tbconf tbconf[TBTYPE] = { +{ 0 }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "6", "4" }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "\1CN", "\1RT", "\2", "\4" }, +{ 8, sizeof (struct gtcopos), 0200, gtcodecode }, +{17, sizeof (struct polpos), 0200, poldecode, 0, 0, "\21", "\5\22\2\23", + TBF_POL }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CL\33", "\1PT\33", 0, 0}, +{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CL\33", "\1PT\33", 0, 0}, +}; + +/* + * Tablet state + */ +struct tb { + int tbflags; /* mode & type bits */ +#define TBMAXREC 17 /* max input record size */ + char cbuf[TBMAXREC]; /* input buffer */ + union { + struct tbpos tbpos; + struct gtcopos gtcopos; + struct polpos polpos; + } rets; /* processed state */ +#define NTBS 16 +} tb[NTBS]; + +/* + * Open as tablet discipline; called on discipline change. 
 */
/*
 * tbopen --
 *	Attach the tablet line discipline to a tty: flush pending
 *	output, claim a free slot in the static tb[] state table and
 *	initialize the input buffer.  ENODEV if the discipline is
 *	already active on this line, EBUSY if all NTBS slots are used.
 */
/*ARGSUSED*/
tbopen(dev, tp)
	dev_t dev;
	register struct tty *tp;
{
	register struct tb *tbp;

	if (tp->t_line == TABLDISC)
		return (ENODEV);
	ttywflush(tp);
	/* Find an unused tablet-state slot; tbflags == 0 means free. */
	for (tbp = tb; tbp < &tb[NTBS]; tbp++)
		if (tbp->tbflags == 0)
			break;
	if (tbp >= &tb[NTBS])
		return (EBUSY);
	tbp->tbflags = TBTIGER|TBPOINT;		/* default */
	tp->t_cp = tbp->cbuf;
	tp->t_inbuf = 0;
	bzero((caddr_t)&tbp->rets, sizeof (tbp->rets));
	tp->T_LINEP = (caddr_t)tbp;
	tp->t_flags |= LITOUT;
	return (0);
}

/*
 * Line discipline change or last device close.
 * Stops the tablet, releases the tb[] slot and clears the tty's
 * input state.
 */
tbclose(tp)
	register struct tty *tp;
{
	register int s;
	int modebits = TBPOINT|TBSTOP;

	tbioctl(tp, BIOSMODE, &modebits, 0);
	s = spltty();		/* block tty interrupts during teardown */
	((struct tb *)tp->T_LINEP)->tbflags = 0;
	tp->t_cp = 0;
	tp->t_inbuf = 0;
	tp->t_rawq.c_cc = 0;	/* clear queues -- paranoid */
	tp->t_canq.c_cc = 0;
	tp->t_line = 0;		/* paranoid: avoid races */
	splx(s);
}

/*
 * Read from a tablet line.
 * Characters have been buffered in a buffer and decoded.
 * Copies the most recently decoded position record to the user; the
 * record size depends on the configured tablet type.
 */
tbread(tp, uio)
	register struct tty *tp;
	struct uio *uio;
{
	register struct tb *tbp = (struct tb *)tp->T_LINEP;
	register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE];
	int ret;

	if ((tp->t_state&TS_CARR_ON) == 0)
		return (EIO);
	ret = uiomove(&tbp->rets, tc->tbc_uiosize, uio);
	/* Polhemus: consume the key so a press is reported only once. */
	if (tc->tbc_flags&TBF_POL)
		tbp->rets.polpos.p_key = ' ';
	return (ret);
}

/*
 * Low level character input routine.
 * Stuff the character in the buffer, and decode
 * if all the chars are there.
 *
 * This routine could be expanded in-line in the receiver
 * interrupt routine to make it run as fast as possible.
 */
tbinput(c, tp)
	register int c;
	register struct tty *tp;
{
	register struct tb *tbp = (struct tb *)tp->T_LINEP;
	register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE];

	if (tc->tbc_recsize == 0 || tc->tbc_decode == 0)	/* paranoid? */
		return;
	/*
	 * Locate sync bit/byte or reset input buffer.
+ */ + if (c&tc->tbc_sync || tp->t_inbuf == tc->tbc_recsize) { + tp->t_cp = tbp->cbuf; + tp->t_inbuf = 0; + } + *tp->t_cp++ = c&0177; + /* + * Call decode routine only if a full record has been collected. + */ + if (++tp->t_inbuf == tc->tbc_recsize) + (*tc->tbc_decode)(tc, tbp->cbuf, &tbp->rets); +} + +/* + * Decode GTCO 8 byte format (high res, tilt, and pressure). + */ +static +gtcodecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct gtcopos *tbpos; +{ + + tbpos->pressure = *cp >> 2; + tbpos->status = (tbpos->pressure > 16) | TBINPROX; /* half way down */ + tbpos->xpos = (*cp++ & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = (*cp++ & 03) << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->xtilt = *cp++; + tbpos->ytilt = *cp++; + tbpos->scount++; +} + +/* + * Decode old Hitachi 5 byte format (low res). + */ +static +tbdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + register char byte; + + byte = *cp++; + tbpos->status = (byte&0100) ? TBINPROX : 0; + byte &= ~0100; + if (byte > 036) + tbpos->status |= 1 << ((byte-040)/2); + tbpos->xpos = *cp++ << 7; + tbpos->xpos |= *cp++; + if (tbpos->xpos < 256) /* tablet wraps around at 256 */ + tbpos->status &= ~TBINPROX; /* make it out of proximity */ + tbpos->ypos = *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->scount++; +} + +/* + * Decode new Hitach 5-byte format (low res). + */ +static +tblresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + + *cp &= ~0100; /* mask sync bit */ + tbpos->status = (*cp++ >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->xpos = *cp++; + tbpos->xpos |= *cp++ << 6; + tbpos->ypos = *cp++; + tbpos->ypos |= *cp++ << 6; + tbpos->scount++; +} + +/* + * Decode new Hitach 6-byte format (high res). 
+ */ +static +tbhresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + char byte; + + byte = *cp++; + tbpos->xpos = (byte & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = *cp++ << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->status = (byte >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->scount++; +} + +/* + * Polhemus decode. + */ +static +poldecode(tc, cp, polpos) + struct tbconf *tc; + register char *cp; + register struct polpos *polpos; +{ + + polpos->p_x = cp[4] | cp[3]<<7 | (cp[9] & 0x03) << 14; + polpos->p_y = cp[6] | cp[5]<<7 | (cp[9] & 0x0c) << 12; + polpos->p_z = cp[8] | cp[7]<<7 | (cp[9] & 0x30) << 10; + polpos->p_azi = cp[11] | cp[10]<<7 | (cp[16] & 0x03) << 14; + polpos->p_pit = cp[13] | cp[12]<<7 | (cp[16] & 0x0c) << 12; + polpos->p_rol = cp[15] | cp[14]<<7 | (cp[16] & 0x30) << 10; + polpos->p_stat = cp[1] | cp[0]<<7; + if (cp[2] != ' ') + polpos->p_key = cp[2]; +} + +/*ARGSUSED*/ +tbioctl(tp, cmd, data, flag) + struct tty *tp; + u_long cmd; + caddr_t data; + int flag; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + + switch (cmd) { + + case BIOGMODE: + *(int *)data = tbp->tbflags & TBMODE; + break; + + case BIOSTYPE: + if (tbconf[*(int *)data & TBTYPE].tbc_recsize == 0 || + tbconf[*(int *)data & TBTYPE].tbc_decode == 0) + return (EINVAL); + tbp->tbflags &= ~TBTYPE; + tbp->tbflags |= *(int *)data & TBTYPE; + /* fall thru... 
to set mode bits */ + + case BIOSMODE: { + register struct tbconf *tc; + + tbp->tbflags &= ~TBMODE; + tbp->tbflags |= *(int *)data & TBMODE; + tc = &tbconf[tbp->tbflags & TBTYPE]; + if (tbp->tbflags&TBSTOP) { + if (tc->tbc_stop) + ttyout(tc->tbc_stop, tp); + } else if (tc->tbc_start) + ttyout(tc->tbc_start, tp); + if (tbp->tbflags&TBPOINT) { + if (tc->tbc_point) + ttyout(tc->tbc_point, tp); + } else if (tc->tbc_run) + ttyout(tc->tbc_run, tp); + ttstart(tp); + break; + } + + case BIOGTYPE: + *(int *)data = tbp->tbflags & TBTYPE; + break; + + case TIOCSETD: + case TIOCGETD: + case TIOCGETP: + case TIOCGETC: + return (-1); /* pass thru... */ + + default: + return (ENOTTY); + } + return (0); +} +#endif diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c new file mode 100644 index 000000000000..d9dd1b46fa59 --- /dev/null +++ b/sys/kern/tty_tty.c @@ -0,0 +1,149 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tty.c 8.4 (Berkeley) 5/14/95 + */ + +/* + * Indirect driver for controlling tty. + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/ioctl.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/vnode.h> +#include <sys/file.h> + +#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL) + +/*ARGSUSED*/ +cttyopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + struct vnode *ttyvp = cttyvp(p); + int error; + + if (ttyvp == NULL) + return (ENXIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); +#ifdef PARANOID + /* + * Since group is tty and mode is 620 on most terminal lines + * and since sessions protect terminals from processes outside + * your session, this check is probably no longer necessary. + * Since it inhibits setuid root programs that later switch + * to another user from accessing /dev/tty, we have decided + * to delete this test. (mckusick 5/93) + */ + error = VOP_ACCESS(ttyvp, + (flag&FREAD ? VREAD : 0) | (flag&FWRITE ? 
	    VWRITE : 0), p->p_ucred, p);
	if (!error)
#endif /* PARANOID */
		error = VOP_OPEN(ttyvp, flag, NOCRED, p);
	VOP_UNLOCK(ttyvp, 0, p);
	return (error);
}

/*
 * cttyread --
 *	Read from /dev/tty: forward the request to the process's
 *	controlling-terminal vnode, or fail with EIO if the process
 *	has no controlling terminal (see cttyvp macro above).
 */
/*ARGSUSED*/
cttyread(dev, uio, flag)
	dev_t dev;
	struct uio *uio;
	int flag;
{
	struct proc *p = uio->uio_procp;
	register struct vnode *ttyvp = cttyvp(p);
	int error;

	if (ttyvp == NULL)
		return (EIO);
	vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p);
	error = VOP_READ(ttyvp, uio, flag, NOCRED);
	VOP_UNLOCK(ttyvp, 0, p);
	return (error);
}

/*
 * cttywrite --
 *	Write to /dev/tty: forward to the controlling-terminal vnode
 *	under an exclusive vnode lock, EIO when there is none.
 */
/*ARGSUSED*/
cttywrite(dev, uio, flag)
	dev_t dev;
	struct uio *uio;
	int flag;
{
	struct proc *p = uio->uio_procp;
	struct vnode *ttyvp = cttyvp(uio->uio_procp);
	int error;

	if (ttyvp == NULL)
		return (EIO);
	vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p);
	error = VOP_WRITE(ttyvp, uio, flag, NOCRED);
	VOP_UNLOCK(ttyvp, 0, p);
	return (error);
}

/*
 * cttyioctl --
 *	Ioctl on /dev/tty.  TIOCNOTTY (give up the controlling tty) is
 *	handled here: a non-session-leader just clears P_CONTROLT, a
 *	session leader gets EINVAL.  Everything else is passed through
 *	to the controlling terminal's vnode.
 */
/*ARGSUSED*/
cttyioctl(dev, cmd, addr, flag, p)
	dev_t dev;
	u_long cmd;
	caddr_t addr;
	int flag;
	struct proc *p;
{
	struct vnode *ttyvp = cttyvp(p);

	if (ttyvp == NULL)
		return (EIO);
	if (cmd == TIOCNOTTY) {
		if (!SESS_LEADER(p)) {
			p->p_flag &= ~P_CONTROLT;
			return (0);
		} else
			return (EINVAL);
	}
	return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, p));
}

/*
 * cttyselect --
 *	Select on /dev/tty: forward to the controlling terminal; with
 *	no controlling terminal report ready so the subsequent
 *	operation returns the real EOF/failure.
 */
/*ARGSUSED*/
cttyselect(dev, flag, p)
	dev_t dev;
	int flag;
	struct proc *p;
{
	struct vnode *ttyvp = cttyvp(p);

	if (ttyvp == NULL)
		return (1);	/* try operation to get EOF/failure */
	return (VOP_SELECT(ttyvp, flag, FREAD|FWRITE, NOCRED, p));
}
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
new file mode 100644
index 000000000000..1c91f2af4a8c
--- /dev/null
+++ b/sys/kern/uipc_domain.c
@@ -0,0 +1,223 @@
/*
 * Copyright (c) 1982, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_domain.c 8.3 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <vm/vm.h> +#include <sys/sysctl.h> + +void pffasttimo __P((void *)); +void pfslowtimo __P((void *)); + +#define ADDDOMAIN(x) { \ + extern struct domain __CONCAT(x,domain); \ + __CONCAT(x,domain.dom_next) = domains; \ + domains = &__CONCAT(x,domain); \ +} + +void +domaininit() +{ + register struct domain *dp; + register struct protosw *pr; + +#undef unix +#ifndef lint + ADDDOMAIN(unix); + ADDDOMAIN(route); +#ifdef INET + ADDDOMAIN(inet); +#endif +#ifdef NS + ADDDOMAIN(ns); +#endif +#ifdef ISO + ADDDOMAIN(iso); +#endif +#ifdef CCITT + ADDDOMAIN(ccitt); +#endif +#include "imp.h" +#if NIMP > 0 + ADDDOMAIN(imp); +#endif +#endif + + for (dp = domains; dp; dp = dp->dom_next) { + if (dp->dom_init) + (*dp->dom_init)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_init) + (*pr->pr_init)(); + } + +if (max_linkhdr < 16) /* XXX */ +max_linkhdr = 16; + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + timeout(pffasttimo, NULL, 1); + timeout(pfslowtimo, NULL, 1); +} + +struct protosw * +pffindtype(family, type) + int family, type; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_type && pr->pr_type == type) + return (pr); + return (0); +} + +struct protosw * +pffindproto(family, protocol, type) + int family, protocol, type; +{ + register struct domain *dp; + register struct protosw *pr; + struct protosw *maybe = 0; + + if (family == 0) + return (0); + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + 
		return (0);
found:
	/*
	 * Exact (protocol, type) match wins; otherwise remember the
	 * first wildcard raw entry (SOCK_RAW with protocol 0) as a
	 * fallback for raw-socket requests.
	 * NOTE(review): dom_protoswNPROTOSW appears to be the
	 * one-past-the-end sentinel of the domain's protosw table,
	 * judging by its use as the loop bound throughout this file.
	 */
	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
		if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
			return (pr);

		if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
		    pr->pr_protocol == 0 && maybe == (struct protosw *)0)
			maybe = pr;
	}
	return (maybe);
}

/*
 * net_sysctl --
 *	Dispatch a "net" sysctl request: name[0] is the protocol
 *	family, name[1] the protocol number; the remainder of the name
 *	is handed to the matching protocol's pr_sysctl handler.
 *	Returns EISDIR for a too-short name, ENOPROTOOPT when no
 *	domain/protocol handler matches.
 */
int
net_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	register struct domain *dp;
	register struct protosw *pr;
	int family, protocol;

	/*
	 * All sysctl names at this level are nonterminal;
	 * next two components are protocol family and protocol number,
	 * then at least one addition component.
	 */
	if (namelen < 3)
		return (EISDIR);		/* overloaded */
	family = name[0];
	protocol = name[1];

	if (family == 0)
		return (0);
	for (dp = domains; dp; dp = dp->dom_next)
		if (dp->dom_family == family)
			goto found;
	return (ENOPROTOOPT);
found:
	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
		if (pr->pr_protocol == protocol && pr->pr_sysctl)
			return ((*pr->pr_sysctl)(name + 2, namelen - 2,
			    oldp, oldlenp, newp, newlen));
	return (ENOPROTOOPT);
}

/*
 * pfctlinput --
 *	Broadcast a control-input notification (e.g. a routing or
 *	interface change, identified by cmd) for address sa to every
 *	protocol in every domain that supplies a pr_ctlinput handler.
 */
void
pfctlinput(cmd, sa)
	int cmd;
	struct sockaddr *sa;
{
	register struct domain *dp;
	register struct protosw *pr;

	for (dp = domains; dp; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_ctlinput)
				(*pr->pr_ctlinput)(cmd, sa, (caddr_t)0);
}

/*
 * pfslowtimo --
 *	Slow protocol timer: run every protocol's pr_slowtimo handler,
 *	then re-arm itself to fire again in hz/2 ticks (twice a second).
 */
void
pfslowtimo(arg)
	void *arg;
{
	register struct domain *dp;
	register struct protosw *pr;

	for (dp = domains; dp; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_slowtimo)
				(*pr->pr_slowtimo)();
	timeout(pfslowtimo, NULL, hz/2);
}

/*
 * pffasttimo --
 *	Fast protocol timer: run every protocol's pr_fasttimo handler,
 *	then re-arm itself to fire again in hz/5 ticks (5 times/sec).
 */
void
pffasttimo(arg)
	void *arg;
{
	register struct domain *dp;
	register struct protosw *pr;

	for (dp = domains; dp; dp = dp->dom_next)
		for (pr =
dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_fasttimo) + (*pr->pr_fasttimo)(); + timeout(pffasttimo, NULL, hz/5); +} diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c new file mode 100644 index 000000000000..62abfd5e171a --- /dev/null +++ b/sys/kern/uipc_mbuf.c @@ -0,0 +1,660 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/map.h> +#define MBTYPES +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/syslog.h> +#include <sys/domain.h> +#include <sys/protosw.h> + +#include <vm/vm.h> + +extern vm_map_t mb_map; +struct mbuf *mbutl; +char *mclrefcnt; + +void +mbinit() +{ + int s; + + s = splimp(); + if (m_clalloc(max(4096/CLBYTES, 1), M_DONTWAIT) == 0) + goto bad; + splx(s); + return; +bad: + panic("mbinit"); +} + +/* + * Allocate some number of mbuf clusters + * and place on cluster free list. + * Must be called at splimp. + */ +/* ARGSUSED */ +int +m_clalloc(ncl, nowait) + register int ncl; + int nowait; +{ + static int logged; + register caddr_t p; + register int i; + int npg; + + npg = ncl * CLSIZE; + p = (caddr_t)kmem_malloc(mb_map, ctob(npg), !nowait); + if (p == NULL) { + if (logged == 0) { + logged++; + log(LOG_ERR, "mb_map full\n"); + } + return (0); + } + ncl = ncl * CLBYTES / MCLBYTES; + for (i = 0; i < ncl; i++) { + ((union mcluster *)p)->mcl_next = mclfree; + mclfree = (union mcluster *)p; + p += MCLBYTES; + mbstat.m_clfree++; + } + mbstat.m_clusters += ncl; + return (1); +} + +/* + * When MGET failes, ask protocols to free space when short of memory, + * then re-attempt to allocate an mbuf. 
+ */ +struct mbuf * +m_retry(i, t) + int i, t; +{ + register struct mbuf *m; + + m_reclaim(); +#define m_retry(i, t) (struct mbuf *)0 + MGET(m, i, t); +#undef m_retry + return (m); +} + +/* + * As above; retry an MGETHDR. + */ +struct mbuf * +m_retryhdr(i, t) + int i, t; +{ + register struct mbuf *m; + + m_reclaim(); +#define m_retryhdr(i, t) (struct mbuf *)0 + MGETHDR(m, i, t); +#undef m_retryhdr + return (m); +} + +void +m_reclaim() +{ + register struct domain *dp; + register struct protosw *pr; + int s = splimp(); + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_drain) + (*pr->pr_drain)(); + splx(s); + mbstat.m_drain++; +} + +/* + * Space allocation routines. + * These are also available as macros + * for critical paths. + */ +struct mbuf * +m_get(nowait, type) + int nowait, type; +{ + register struct mbuf *m; + + MGET(m, nowait, type); + return (m); +} + +struct mbuf * +m_gethdr(nowait, type) + int nowait, type; +{ + register struct mbuf *m; + + MGETHDR(m, nowait, type); + return (m); +} + +struct mbuf * +m_getclr(nowait, type) + int nowait, type; +{ + register struct mbuf *m; + + MGET(m, nowait, type); + if (m == 0) + return (0); + bzero(mtod(m, caddr_t), MLEN); + return (m); +} + +struct mbuf * +m_free(m) + struct mbuf *m; +{ + register struct mbuf *n; + + MFREE(m, n); + return (n); +} + +void +m_freem(m) + register struct mbuf *m; +{ + register struct mbuf *n; + + if (m == NULL) + return; + do { + MFREE(m, n); + } while (m = n); +} + +/* + * Mbuffer utility routines. + */ + +/* + * Lesser-used path for M_PREPEND: + * allocate new mbuf to prepend to chain, + * copy junk along. 
+ */ +struct mbuf * +m_prepend(m, len, how) + register struct mbuf *m; + int len, how; +{ + struct mbuf *mn; + + MGET(mn, how, m->m_type); + if (mn == (struct mbuf *)NULL) { + m_freem(m); + return ((struct mbuf *)NULL); + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(mn, m); + m->m_flags &= ~M_PKTHDR; + } + mn->m_next = m; + m = mn; + if (len < MHLEN) + MH_ALIGN(m, len); + m->m_len = len; + return (m); +} + +/* + * Make a copy of an mbuf chain starting "off0" bytes from the beginning, + * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. + * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. + */ +int MCFail; + +struct mbuf * +m_copym(m, off0, len, wait) + register struct mbuf *m; + int off0, wait; + register int len; +{ + register struct mbuf *n, **np; + register int off = off0; + struct mbuf *top; + int copyhdr = 0; + + if (off < 0 || len < 0) + panic("m_copym"); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = 1; + while (off > 0) { + if (m == 0) + panic("m_copym"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + np = &top; + top = 0; + while (len > 0) { + if (m == 0) { + if (len != M_COPYALL) + panic("m_copym"); + break; + } + MGET(n, wait, m->m_type); + *np = n; + if (n == 0) + goto nospace; + if (copyhdr) { + M_COPY_PKTHDR(n, m); + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + copyhdr = 0; + } + n->m_len = min(len, m->m_len - off); + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (unsigned)n->m_len); + if (len != M_COPYALL) + len -= n->m_len; + off = 0; + m = m->m_next; + np = &n->m_next; + } + if (top == 0) + MCFail++; + return (top); +nospace: + m_freem(top); + MCFail++; + return (0); +} + +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the 
indicated buffer. + */ +void +m_copydata(m, off, len, cp) + register struct mbuf *m; + register int off; + register int len; + caddr_t cp; +{ + register unsigned count; + + if (off < 0 || len < 0) + panic("m_copydata"); + while (off > 0) { + if (m == 0) + panic("m_copydata"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + if (m == 0) + panic("m_copydata"); + count = min(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + +/* + * Concatenate mbuf chain n to m. + * Both chains must be of the same type (e.g. MT_DATA). + * Any m_pkthdr is not updated. + */ +void +m_cat(m, n) + register struct mbuf *m, *n; +{ + while (m->m_next) + m = m->m_next; + while (n) { + if (m->m_flags & M_EXT || + m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { + /* just join the two chains */ + m->m_next = n; + return; + } + /* splat the data from one into the other */ + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (u_int)n->m_len); + m->m_len += n->m_len; + n = m_free(n); + } +} + +void +m_adj(mp, req_len) + struct mbuf *mp; + int req_len; +{ + register int len = req_len; + register struct mbuf *m; + register count; + + if ((m = mp) == NULL) + return; + if (len >= 0) { + /* + * Trim from head. + */ + while (m != NULL && len > 0) { + if (m->m_len <= len) { + len -= m->m_len; + m->m_len = 0; + m = m->m_next; + } else { + m->m_len -= len; + m->m_data += len; + len = 0; + } + } + m = mp; + if (mp->m_flags & M_PKTHDR) + m->m_pkthdr.len -= (req_len - len); + } else { + /* + * Trim from tail. Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. 
+ */ + len = -len; + count = 0; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len >= len) { + m->m_len -= len; + if (mp->m_flags & M_PKTHDR) + mp->m_pkthdr.len -= len; + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + m = mp; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len = count; + for (; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + break; + } + count -= m->m_len; + } + while (m = m->m_next) + m->m_len = 0; + } +} + +/* + * Rearange an mbuf chain so that len bytes are contiguous + * and in the data area of an mbuf (so that mtod and dtom + * will work for a structure of size len). Returns the resulting + * mbuf chain on success, frees it and returns null on failure. + * If there is room, it will add up to max_protohdr-len extra bytes to the + * contiguous region in an attempt to avoid being called next time. + */ +int MPFail; + +struct mbuf * +m_pullup(n, len) + register struct mbuf *n; + int len; +{ + register struct mbuf *m; + register int count; + int space; + + /* + * If first mbuf has no cluster, and has room for len bytes + * without shifting current data, pullup into it, + * otherwise allocate a new mbuf to prepend to the chain. 
+ */ + if ((n->m_flags & M_EXT) == 0 && + n->m_data + len < &n->m_dat[MLEN] && n->m_next) { + if (n->m_len >= len) + return (n); + m = n; + n = n->m_next; + len -= m->m_len; + } else { + if (len > MHLEN) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == 0) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(m, n); + n->m_flags &= ~M_PKTHDR; + } + } + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { + count = min(min(max(len, max_protohdr), space), n->m_len); + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + MPFail++; + return (0); +} + +/* + * Partition an mbuf chain in two pieces, returning the tail -- + * all but the first len0 bytes. In case of failure, it returns NULL and + * attempts to restore the chain to its original state. 
+ */ +struct mbuf * +m_split(m0, len0, wait) + register struct mbuf *m0; + int len0, wait; +{ + register struct mbuf *m, *n; + unsigned len = len0, remain; + + for (m = m0; m && len > m->m_len; m = m->m_next) + len -= m->m_len; + if (m == 0) + return (0); + remain = m->m_len - len; + if (m0->m_flags & M_PKTHDR) { + MGETHDR(n, wait, m0->m_type); + if (n == 0) + return (0); + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + n->m_pkthdr.len = m0->m_pkthdr.len - len0; + m0->m_pkthdr.len = len0; + if (m->m_flags & M_EXT) + goto extpacket; + if (remain > MHLEN) { + /* m can't be the lead packet */ + MH_ALIGN(n, 0); + n->m_next = m_split(m, len, wait); + if (n->m_next == 0) { + (void) m_free(n); + return (0); + } else + return (n); + } else + MH_ALIGN(n, remain); + } else if (remain == 0) { + n = m->m_next; + m->m_next = 0; + return (n); + } else { + MGET(n, wait, m->m_type); + if (n == 0) + return (0); + M_ALIGN(n, remain); + } +extpacket: + if (m->m_flags & M_EXT) { + n->m_flags |= M_EXT; + n->m_ext = m->m_ext; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */ + n->m_data = m->m_data + len; + } else { + bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); + } + n->m_len = remain; + m->m_len = len; + n->m_next = m->m_next; + m->m_next = 0; + return (n); +} +/* + * Routine to copy from device local memory into mbufs. + */ +struct mbuf * +m_devget(buf, totlen, off0, ifp, copy) + char *buf; + int totlen, off0; + struct ifnet *ifp; + void (*copy)(); +{ + register struct mbuf *m; + struct mbuf *top = 0, **mp = &top; + register int off = off0, len; + register char *cp; + char *epkt; + + cp = buf; + epkt = cp + totlen; + if (off) { + /* + * If 'off' is non-zero, packet is trailer-encapsulated, + * so we have to skip the type and length fields. 
+ */ + cp += off + 2 * sizeof(u_int16_t); + totlen -= 2 * sizeof(u_int16_t); + } + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == 0) + return (0); + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = totlen; + m->m_len = MHLEN; + + while (totlen > 0) { + if (top) { + MGET(m, M_DONTWAIT, MT_DATA); + if (m == 0) { + m_freem(top); + return (0); + } + m->m_len = MLEN; + } + len = min(totlen, epkt - cp); + if (len >= MINCLSIZE) { + MCLGET(m, M_DONTWAIT); + if (m->m_flags & M_EXT) + m->m_len = len = min(len, MCLBYTES); + else + len = m->m_len; + } else { + /* + * Place initial small packet/header at end of mbuf. + */ + if (len < m->m_len) { + if (top == 0 && len + max_linkhdr <= m->m_len) + m->m_data += max_linkhdr; + m->m_len = len; + } else + len = m->m_len; + } + if (copy) + copy(cp, mtod(m, caddr_t), (unsigned)len); + else + bcopy(cp, mtod(m, caddr_t), (unsigned)len); + cp += len; + *mp = m; + mp = &m->m_next; + totlen -= len; + if (cp == epkt) + cp = buf; + } + return (top); +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c new file mode 100644 index 000000000000..e89a84c2f51e --- /dev/null +++ b/sys/kern/uipc_proto.c @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_proto.c 8.2 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> + +/* + * Definitions of protocols supported in the UNIX domain. 
+ */ + +int uipc_usrreq(), raw_usrreq(); +void raw_init(), raw_input(), raw_ctlinput(); +extern struct domain unixdomain; /* or at least forward */ + +struct protosw unixsw[] = { +{ SOCK_STREAM, &unixdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, + 0, 0, 0, 0, + uipc_usrreq, + 0, 0, 0, 0, +}, +{ SOCK_DGRAM, &unixdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, + 0, 0, 0, 0, + uipc_usrreq, + 0, 0, 0, 0, +}, +{ 0, 0, 0, 0, + raw_input, 0, raw_ctlinput, 0, + raw_usrreq, + raw_init, 0, 0, 0, +} +}; + +int unp_externalize(), unp_dispose(); + +struct domain unixdomain = + { AF_UNIX, "unix", 0, unp_externalize, unp_dispose, + unixsw, &unixsw[sizeof(unixsw)/sizeof(unixsw[0])] }; diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c new file mode 100644 index 000000000000..a9c5453e96ed --- /dev/null +++ b/sys/kern/uipc_socket.c @@ -0,0 +1,1040 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/resourcevar.h> + +/* + * Socket operation routines. + * These routines are called by the routines in + * sys_socket.c or from a system process, and + * implement the semantics of socket operations by + * switching out to the protocol specific routines. 
+ */ +/*ARGSUSED*/ +int +socreate(dom, aso, type, proto) + int dom; + struct socket **aso; + register int type; + int proto; +{ + struct proc *p = curproc; /* XXX */ + register struct protosw *prp; + register struct socket *so; + register int error; + + if (proto) + prp = pffindproto(dom, proto, type); + else + prp = pffindtype(dom, type); + if (prp == 0 || prp->pr_usrreq == 0) + return (EPROTONOSUPPORT); + if (prp->pr_type != type) + return (EPROTOTYPE); + MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT); + bzero((caddr_t)so, sizeof(*so)); + so->so_type = type; + if (p->p_ucred->cr_uid == 0) + so->so_state = SS_PRIV; + so->so_proto = prp; + error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, + (struct mbuf *)(long)proto, (struct mbuf *)0); + if (error) { + so->so_state |= SS_NOFDREF; + sofree(so); + return (error); + } + *aso = so; + return (0); +} + +int +sobind(so, nam) + struct socket *so; + struct mbuf *nam; +{ + int s = splnet(); + int error; + + error = + (*so->so_proto->pr_usrreq)(so, PRU_BIND, + (struct mbuf *)0, nam, (struct mbuf *)0); + splx(s); + return (error); +} + +int +solisten(so, backlog) + register struct socket *so; + int backlog; +{ + int s = splnet(), error; + + error = + (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + if (error) { + splx(s); + return (error); + } + if (so->so_q == 0) + so->so_options |= SO_ACCEPTCONN; + if (backlog < 0) + backlog = 0; + so->so_qlimit = min(backlog, SOMAXCONN); + splx(s); + return (0); +} + +int +sofree(so) + register struct socket *so; +{ + + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + return; + if (so->so_head) { + if (!soqremque(so, 0) && !soqremque(so, 1)) + panic("sofree dq"); + so->so_head = 0; + } + sbrelease(&so->so_snd); + sorflush(so); + FREE(so, M_SOCKET); +} + +/* + * Close a socket on last file table reference removal. + * Initiate disconnect if connected. + * Free socket when disconnect complete. 
+ */ +int +soclose(so) + register struct socket *so; +{ + int s = splnet(); /* conservative */ + int error = 0; + + if (so->so_options & SO_ACCEPTCONN) { + while (so->so_q0) + (void) soabort(so->so_q0); + while (so->so_q) + (void) soabort(so->so_q); + } + if (so->so_pcb == 0) + goto discard; + if (so->so_state & SS_ISCONNECTED) { + if ((so->so_state & SS_ISDISCONNECTING) == 0) { + error = sodisconnect(so); + if (error) + goto drop; + } + if (so->so_options & SO_LINGER) { + if ((so->so_state & SS_ISDISCONNECTING) && + (so->so_state & SS_NBIO)) + goto drop; + while (so->so_state & SS_ISCONNECTED) + if (error = tsleep((caddr_t)&so->so_timeo, + PSOCK | PCATCH, netcls, so->so_linger * hz)) + break; + } + } +drop: + if (so->so_pcb) { + int error2 = + (*so->so_proto->pr_usrreq)(so, PRU_DETACH, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + if (error == 0) + error = error2; + } +discard: + if (so->so_state & SS_NOFDREF) + panic("soclose: NOFDREF"); + so->so_state |= SS_NOFDREF; + sofree(so); + splx(s); + return (error); +} + +/* + * Must be called at splnet... + */ +int +soabort(so) + struct socket *so; +{ + + return ( + (*so->so_proto->pr_usrreq)(so, PRU_ABORT, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); +} + +int +soaccept(so, nam) + register struct socket *so; + struct mbuf *nam; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_NOFDREF) == 0) + panic("soaccept: !NOFDREF"); + so->so_state &= ~SS_NOFDREF; + error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, + (struct mbuf *)0, nam, (struct mbuf *)0); + splx(s); + return (error); +} + +int +soconnect(so, nam) + register struct socket *so; + struct mbuf *nam; +{ + int s; + int error; + + if (so->so_options & SO_ACCEPTCONN) + return (EOPNOTSUPP); + s = splnet(); + /* + * If protocol is connection-based, can only connect once. + * Otherwise, if connected, try to disconnect first. + * This allows user to disconnect by connecting to, e.g., + * a null address. 
+ */ + if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && + ((so->so_proto->pr_flags & PR_CONNREQUIRED) || + (error = sodisconnect(so)))) + error = EISCONN; + else + error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, + (struct mbuf *)0, nam, (struct mbuf *)0); + splx(s); + return (error); +} + +int +soconnect2(so1, so2) + register struct socket *so1; + struct socket *so2; +{ + int s = splnet(); + int error; + + error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, + (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0); + splx(s); + return (error); +} + +int +sodisconnect(so) + register struct socket *so; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto bad; + } + if (so->so_state & SS_ISDISCONNECTING) { + error = EALREADY; + goto bad; + } + error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); +bad: + splx(s); + return (error); +} + +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) +/* + * Send on a socket. + * If send must go all at once and message is larger than + * send buffering, then hard error. + * Lock against other senders. + * If must go all at once and not enough room now, then + * inform user that this would block and do nothing. + * Otherwise, if nonblocking, send as much as possible. + * The data to be sent is described by "uio" if nonzero, + * otherwise by the mbuf chain "top" (which must be null + * if uio is not). Data provided in mbuf chain must be small + * enough to send all at once. + * + * Returns nonzero on error, timeout or signal; callers + * must check for short counts if EINTR/ERESTART are returned. + * Data and control buffers are freed on return. 
+ */ +int +sosend(so, addr, uio, top, control, flags) + register struct socket *so; + struct mbuf *addr; + struct uio *uio; + struct mbuf *top; + struct mbuf *control; + int flags; +{ + struct proc *p = curproc; /* XXX */ + struct mbuf **mp; + register struct mbuf *m; + register long space, len, resid; + int clen = 0, error, s, dontroute, mlen; + int atomic = sosendallatonce(so) || top; + + if (uio) + resid = uio->uio_resid; + else + resid = top->m_pkthdr.len; + /* + * In theory resid should be unsigned. + * However, space must be signed, as it might be less than 0 + * if we over-committed, and we must use a signed comparison + * of space and resid. On the other hand, a negative resid + * causes us to loop sending 0-length segments to the protocol. + */ + if (resid < 0) + return (EINVAL); + dontroute = + (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && + (so->so_proto->pr_flags & PR_ATOMIC); + p->p_stats->p_ru.ru_msgsnd++; + if (control) + clen = control->m_len; +#define snderr(errno) { error = errno; splx(s); goto release; } + +restart: + if (error = sblock(&so->so_snd, SBLOCKWAIT(flags))) + goto out; + do { + s = splnet(); + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + if (so->so_error) + snderr(so->so_error); + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so->so_state & SS_ISCONFIRMING) == 0 && + !(resid == 0 && clen != 0)) + snderr(ENOTCONN); + } else if (addr == 0) + snderr(EDESTADDRREQ); + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if (atomic && resid > so->so_snd.sb_hiwat || + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); + if (space < resid + clen && uio && + (atomic || space < so->so_snd.sb_lowat || space < clen)) { + if (so->so_state & SS_NBIO) + snderr(EWOULDBLOCK); + sbunlock(&so->so_snd); + error = sbwait(&so->so_snd); + splx(s); + if (error) + goto out; + goto restart; + } + splx(s); + mp = &top; + space -= clen; + do { + if (uio == NULL) 
{ + /* + * Data is prepackaged in "top". + */ + resid = 0; + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + } else do { + if (top == 0) { + MGETHDR(m, M_WAIT, MT_DATA); + mlen = MHLEN; + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + MGET(m, M_WAIT, MT_DATA); + mlen = MLEN; + } + if (resid >= MINCLSIZE && space >= MCLBYTES) { + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) + goto nopages; + mlen = MCLBYTES; +#ifdef MAPPED_MBUFS + len = min(MCLBYTES, resid); +#else + if (atomic && top == 0) { + len = min(MCLBYTES - max_hdr, resid); + m->m_data += max_hdr; + } else + len = min(MCLBYTES, resid); +#endif + space -= MCLBYTES; + } else { +nopages: + len = min(min(mlen, resid), space); + space -= len; + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. + */ + if (atomic && top == 0 && len < mlen) + MH_ALIGN(m, len); + } + error = uiomove(mtod(m, caddr_t), (int)len, uio); + resid = uio->uio_resid; + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + goto release; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + } while (space > 0 && atomic); + if (dontroute) + so->so_options |= SO_DONTROUTE; + s = splnet(); /* XXX */ + error = (*so->so_proto->pr_usrreq)(so, + (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, + top, addr, control); + splx(s); + if (dontroute) + so->so_options &= ~SO_DONTROUTE; + clen = 0; + control = 0; + top = 0; + mp = &top; + if (error) + goto release; + } while (resid && space > 0); + } while (resid); + +release: + sbunlock(&so->so_snd); +out: + if (top) + m_freem(top); + if (control) + m_freem(control); + return (error); +} + +/* + * Implement receive operations on a socket. + * We depend on the way that records are added to the sockbuf + * by sbappend*. 
In particular, each record (mbufs linked through m_next) + * must begin with an address if the protocol so specifies, + * followed by an optional mbuf or mbufs containing ancillary data, + * and then zero or more mbufs of data. + * In order to avoid blocking network interrupts for the entire time here, + * we splx() while doing the actual copy to user space. + * Although the sockbuf is locked, new data may still be appended, + * and thus we must maintain consistency of the sockbuf during that time. + * + * The caller may receive the data as a single mbuf chain by supplying + * an mbuf **mp0 for use in returning the chain. The uio is then used + * only for the count in uio_resid. + */ +int +soreceive(so, paddr, uio, mp0, controlp, flagsp) + register struct socket *so; + struct mbuf **paddr; + struct uio *uio; + struct mbuf **mp0; + struct mbuf **controlp; + int *flagsp; +{ + register struct mbuf *m, **mp; + register int flags, len, error, s, offset; + struct protosw *pr = so->so_proto; + struct mbuf *nextrecord; + int moff, type; + int orig_resid = uio->uio_resid; + + mp = mp0; + if (paddr) + *paddr = 0; + if (controlp) + *controlp = 0; + if (flagsp) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + if (flags & MSG_OOB) { + m = m_get(M_WAIT, MT_DATA); + error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, + (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0); + if (error) + goto bad; + do { + error = uiomove(mtod(m, caddr_t), + (int) min(uio->uio_resid, m->m_len), uio); + m = m_free(m); + } while (uio->uio_resid && error == 0 && m); +bad: + if (m) + m_freem(m); + return (error); + } + if (mp) + *mp = (struct mbuf *)0; + if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) + (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, + (struct mbuf *)0, (struct mbuf *)0); + +restart: + if (error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) + return (error); + s = splnet(); + + m = so->so_rcv.sb_mb; + /* + * If we have less data than requested, block awaiting more + * (subject to 
any timeout) if: + * 1. the current count is less than the low water mark, or + * 2. MSG_WAITALL is set, and it is possible to do the entire + * receive operation at once if we block (resid <= hiwat), or + * 3. MSG_DONTWAIT is not set. + * If MSG_WAITALL is set but resid is larger than the receive buffer, + * we have to do the receive in sections, and thus risk returning + * a short count if a timeout or signal occurs after we start. + */ + if (m == 0 || ((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid) && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0) { +#ifdef DIAGNOSTIC + if (m == 0 && so->so_rcv.sb_cc) + panic("receive 1"); +#endif + if (so->so_error) { + if (m) + goto dontblock; + error = so->so_error; + if ((flags & MSG_PEEK) == 0) + so->so_error = 0; + goto release; + } + if (so->so_state & SS_CANTRCVMORE) { + if (m) + goto dontblock; + else + goto release; + } + for (; m; m = m->m_next) + if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { + m = so->so_rcv.sb_mb; + goto dontblock; + } + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; + goto release; + } + if (uio->uio_resid == 0) + goto release; + if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + error = EWOULDBLOCK; + goto release; + } + sbunlock(&so->so_rcv); + error = sbwait(&so->so_rcv); + splx(s); + if (error) + return (error); + goto restart; + } +dontblock: + if (uio->uio_procp) + uio->uio_procp->p_stats->p_ru.ru_msgrcv++; + nextrecord = m->m_nextpkt; + if (pr->pr_flags & PR_ADDR) { +#ifdef DIAGNOSTIC + if (m->m_type != MT_SONAME) + panic("receive 1a"); +#endif + orig_resid = 0; + if (flags & MSG_PEEK) { + if (paddr) + *paddr = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + if (paddr) { + *paddr = m; + so->so_rcv.sb_mb = m->m_next; + 
m->m_next = 0; + m = so->so_rcv.sb_mb; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + } + while (m && m->m_type == MT_CONTROL && error == 0) { + if (flags & MSG_PEEK) { + if (controlp) + *controlp = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + if (controlp) { + if (pr->pr_domain->dom_externalize && + mtod(m, struct cmsghdr *)->cmsg_type == + SCM_RIGHTS) + error = (*pr->pr_domain->dom_externalize)(m); + *controlp = m; + so->so_rcv.sb_mb = m->m_next; + m->m_next = 0; + m = so->so_rcv.sb_mb; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + if (controlp) { + orig_resid = 0; + controlp = &(*controlp)->m_next; + } + } + if (m) { + if ((flags & MSG_PEEK) == 0) + m->m_nextpkt = nextrecord; + type = m->m_type; + if (type == MT_OOBDATA) + flags |= MSG_OOB; + } + moff = 0; + offset = 0; + while (m && uio->uio_resid > 0 && error == 0) { + if (m->m_type == MT_OOBDATA) { + if (type != MT_OOBDATA) + break; + } else if (type == MT_OOBDATA) + break; +#ifdef DIAGNOSTIC + else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) + panic("receive 3"); +#endif + so->so_state &= ~SS_RCVATMARK; + len = uio->uio_resid; + if (so->so_oobmark && len > so->so_oobmark - offset) + len = so->so_oobmark - offset; + if (len > m->m_len - moff) + len = m->m_len - moff; + /* + * If mp is set, just pass back the mbufs. + * Otherwise copy them out via the uio, then free. + * Sockbuf must be consistent here (points to current mbuf, + * it points to next record) when we drop priority; + * we must note any additions to the sockbuf when we + * block interrupts again. 
+ */ + if (mp == 0) { + splx(s); + error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + s = splnet(); + } else + uio->uio_resid -= len; + if (len == m->m_len - moff) { + if (m->m_flags & M_EOR) + flags |= MSG_EOR; + if (flags & MSG_PEEK) { + m = m->m_next; + moff = 0; + } else { + nextrecord = m->m_nextpkt; + sbfree(&so->so_rcv, m); + if (mp) { + *mp = m; + mp = &m->m_next; + so->so_rcv.sb_mb = m = m->m_next; + *mp = (struct mbuf *)0; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + if (m) + m->m_nextpkt = nextrecord; + } + } else { + if (flags & MSG_PEEK) + moff += len; + else { + if (mp) + *mp = m_copym(m, 0, len, M_WAIT); + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; + } + } + if (so->so_oobmark) { + if ((flags & MSG_PEEK) == 0) { + so->so_oobmark -= len; + if (so->so_oobmark == 0) { + so->so_state |= SS_RCVATMARK; + break; + } + } else { + offset += len; + if (offset == so->so_oobmark) + break; + } + } + if (flags & MSG_EOR) + break; + /* + * If the MSG_WAITALL flag is set (for non-atomic socket), + * we must not quit until "uio->uio_resid == 0" or an error + * termination. If a signal/timeout occurs, return + * with a short count but without error. + * Keep sockbuf locked against other readers. 
+ */ + while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + !sosendallatonce(so) && !nextrecord) { + if (so->so_error || so->so_state & SS_CANTRCVMORE) + break; + error = sbwait(&so->so_rcv); + if (error) { + sbunlock(&so->so_rcv); + splx(s); + return (0); + } + if (m = so->so_rcv.sb_mb) + nextrecord = m->m_nextpkt; + } + } + + if (m && pr->pr_flags & PR_ATOMIC) { + flags |= MSG_TRUNC; + if ((flags & MSG_PEEK) == 0) + (void) sbdroprecord(&so->so_rcv); + } + if ((flags & MSG_PEEK) == 0) { + if (m == 0) + so->so_rcv.sb_mb = nextrecord; + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, + (struct mbuf *)(long)flags, (struct mbuf *)0, + (struct mbuf *)0); + } + if (orig_resid == uio->uio_resid && orig_resid && + (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { + sbunlock(&so->so_rcv); + splx(s); + goto restart; + } + + if (flagsp) + *flagsp |= flags; +release: + sbunlock(&so->so_rcv); + splx(s); + return (error); +} + +int +soshutdown(so, how) + register struct socket *so; + register int how; +{ + register struct protosw *pr = so->so_proto; + + how++; + if (how & FREAD) + sorflush(so); + if (how & FWRITE) + return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + return (0); +} + +void +sorflush(so) + register struct socket *so; +{ + register struct sockbuf *sb = &so->so_rcv; + register struct protosw *pr = so->so_proto; + register int s; + struct sockbuf asb; + + sb->sb_flags |= SB_NOINTR; + (void) sblock(sb, M_WAITOK); + s = splimp(); + socantrcvmore(so); + sbunlock(sb); + asb = *sb; + bzero((caddr_t)sb, sizeof (*sb)); + splx(s); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + (*pr->pr_domain->dom_dispose)(asb.sb_mb); + sbrelease(&asb); +} + +int +sosetopt(so, level, optname, m0) + register struct socket *so; + int level, optname; + struct mbuf *m0; +{ + int error = 0; + register struct mbuf *m = m0; + + if (level != SOL_SOCKET) { 
+ if (so->so_proto && so->so_proto->pr_ctloutput) + return ((*so->so_proto->pr_ctloutput) + (PRCO_SETOPT, so, level, optname, &m0)); + error = ENOPROTOOPT; + } else { + switch (optname) { + + case SO_LINGER: + if (m == NULL || m->m_len != sizeof (struct linger)) { + error = EINVAL; + goto bad; + } + so->so_linger = mtod(m, struct linger *)->l_linger; + /* fall thru... */ + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + if (m == NULL || m->m_len < sizeof (int)) { + error = EINVAL; + goto bad; + } + if (*mtod(m, int *)) + so->so_options |= optname; + else + so->so_options &= ~optname; + break; + + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + if (m == NULL || m->m_len < sizeof (int)) { + error = EINVAL; + goto bad; + } + switch (optname) { + + case SO_SNDBUF: + case SO_RCVBUF: + if (sbreserve(optname == SO_SNDBUF ? + &so->so_snd : &so->so_rcv, + (u_long) *mtod(m, int *)) == 0) { + error = ENOBUFS; + goto bad; + } + break; + + case SO_SNDLOWAT: + so->so_snd.sb_lowat = *mtod(m, int *); + break; + case SO_RCVLOWAT: + so->so_rcv.sb_lowat = *mtod(m, int *); + break; + } + break; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + { + struct timeval *tv; + short val; + + if (m == NULL || m->m_len < sizeof (*tv)) { + error = EINVAL; + goto bad; + } + tv = mtod(m, struct timeval *); + if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) { + error = EDOM; + goto bad; + } + val = tv->tv_sec * hz + tv->tv_usec / tick; + + switch (optname) { + + case SO_SNDTIMEO: + so->so_snd.sb_timeo = val; + break; + case SO_RCVTIMEO: + so->so_rcv.sb_timeo = val; + break; + } + break; + } + + default: + error = ENOPROTOOPT; + break; + } + if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { + (void) ((*so->so_proto->pr_ctloutput) + (PRCO_SETOPT, so, level, optname, &m0)); + m = NULL; /* freed by protocol */ + } + } +bad: + if (m) + (void) 
m_free(m); + return (error); +} + +int +sogetopt(so, level, optname, mp) + register struct socket *so; + int level, optname; + struct mbuf **mp; +{ + register struct mbuf *m; + + if (level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) { + return ((*so->so_proto->pr_ctloutput) + (PRCO_GETOPT, so, level, optname, mp)); + } else + return (ENOPROTOOPT); + } else { + m = m_get(M_WAIT, MT_SOOPTS); + m->m_len = sizeof (int); + + switch (optname) { + + case SO_LINGER: + m->m_len = sizeof (struct linger); + mtod(m, struct linger *)->l_onoff = + so->so_options & SO_LINGER; + mtod(m, struct linger *)->l_linger = so->so_linger; + break; + + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + *mtod(m, int *) = so->so_options & optname; + break; + + case SO_TYPE: + *mtod(m, int *) = so->so_type; + break; + + case SO_ERROR: + *mtod(m, int *) = so->so_error; + so->so_error = 0; + break; + + case SO_SNDBUF: + *mtod(m, int *) = so->so_snd.sb_hiwat; + break; + + case SO_RCVBUF: + *mtod(m, int *) = so->so_rcv.sb_hiwat; + break; + + case SO_SNDLOWAT: + *mtod(m, int *) = so->so_snd.sb_lowat; + break; + + case SO_RCVLOWAT: + *mtod(m, int *) = so->so_rcv.sb_lowat; + break; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + { + int val = (optname == SO_SNDTIMEO ? 
+ so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + m->m_len = sizeof(struct timeval); + mtod(m, struct timeval *)->tv_sec = val / hz; + mtod(m, struct timeval *)->tv_usec = + (val % hz) * tick; + break; + } + + default: + (void)m_free(m); + return (ENOPROTOOPT); + } + *mp = m; + return (0); + } +} + +void +sohasoutofband(so) + register struct socket *so; +{ + struct proc *p; + + if (so->so_pgid < 0) + gsignal(-so->so_pgid, SIGURG); + else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) + psignal(p, SIGURG); + selwakeup(&so->so_rcv.sb_sel); +} diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c new file mode 100644 index 000000000000..865108aaee3f --- /dev/null +++ b/sys/kern/uipc_socket2.c @@ -0,0 +1,779 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +/* strings for sleep message: */ +char netio[] = "netio"; +char netcon[] = "netcon"; +char netcls[] = "netcls"; + +u_long sb_max = SB_MAX; /* patchable */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. 
The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_q0 for connections in progress
 * and so_q for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_q0 or so_q, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */

/*
 * Mark a socket as having an outbound connection attempt in progress:
 * clear any stale connected/disconnecting state and set SS_ISCONNECTING.
 */
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

/*
 * Mark a socket as connected.  For an incoming connection (so_head set,
 * socket still on the partial queue so_q0), move it to the completed
 * queue so_q and wake up the accepting socket.  Otherwise wake up
 * anyone sleeping in connect() (on so_timeo) and any readers/writers.
 */
void
soisconnected(so)
	register struct socket *so;
{
	register struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;
	if (head && soqremque(so, 0)) {
		/* incoming connection: promote from so_q0 to so_q */
		soqinsque(head, so, 1);
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	} else {
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}

/*
 * Mark a socket as disconnecting: no further data may be sent or
 * received, and sleepers in connect()/accept() and in the send and
 * receive buffers are woken so they can observe the state change.
 */
void
soisdisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}

/*
 * Mark a socket as fully disconnected from its peer and wake all
 * sleepers, as in soisdisconnecting() above.
 */
void
soisdisconnected(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	/* (tail of soisdisconnected: wake up readers too) */
	sorwakeup(so);
}

/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 *
 * Currently, sonewconn() is defined as sonewconn1() in socketvar.h
 * to catch calls that are missing the (new) second parameter.
 */
struct socket *
sonewconn1(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	register struct socket *so;
	int soqueue = connstatus ? 1 : 0;	/* completed queue iff status given */

	/*
	 * Refuse the connection if the listen backlog (both partial and
	 * completed queues) already exceeds 3/2 of the user's limit.
	 */
	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	/* M_DONTWAIT: called from protocol (interrupt) context; may fail. */
	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT);
	if (so == NULL)
		return ((struct socket *)0);
	bzero((caddr_t)so, sizeof(*so));
	/* Inherit type, options (minus accepting), state, etc. from listener. */
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	(void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
	soqinsque(head, so, soqueue);
	/* Let the protocol attach; on failure undo the queue insert and free. */
	if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)) {
		(void) soqremque(so, soqueue);
		(void) free((caddr_t)so, M_SOCKET);
		return ((struct socket *)0);
	}
	if (connstatus) {
		/* Connection already established: notify the accepting side. */
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
		so->so_state |= connstatus;
	}
	return (so);
}

/*
 * Insert socket so at the tail of head's partial (q == 0, so_q0) or
 * completed (q != 0, so_q) connection queue.
 */
void
soqinsque(head, so, q)
	register struct socket *head, *so;
	int q;
{

	register struct socket **prev;
	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_q0 = 0;
		/* walk the singly-linked list to its tail */
		for (prev = &(head->so_q0); *prev; )
			prev = &((*prev)->so_q0);
	} else {
		head->so_qlen++;
		so->so_q = 0;
		for (prev = &(head->so_q); *prev; )
			prev = &((*prev)->so_q);
	}
	*prev = so;
}

/*
 * Remove socket so from its head's partial (q == 0) or completed
 * (q != 0) connection queue.  Returns 1 if the socket was found and
 * unlinked, 0 if it was not on the indicated queue.
 */
int
soqremque(so, q)
	register struct socket *so;
	int q;
{
	register struct socket *head, *prev, *next;

	head = so->so_head;
	prev = head;
	/* linear search of the singly-linked queue for so */
	for (;;) {
		next = q ? prev->so_q : prev->so_q0;
		if (next == so)
			break;
		if (next == 0)
			return (0);
		prev = next;
	}
	if (q == 0) {
		prev->so_q0 = next->so_q0;
		head->so_q0len--;
	} else {
		prev->so_q = next->so_q;
		head->so_qlen--;
	}
	next->so_q0 = next->so_q = 0;
	next->so_head = 0;
	return (1);
}

/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(so)
	struct socket *so;
{

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);	/* unblock writers so they see the EOF condition */
}

void
socantrcvmore(so)
	struct socket *so;
{

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);	/* unblock readers so they see the EOF condition */
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 * Sleeps on sb_cc; the sleep is interruptible (PCATCH) unless
 * SB_NOINTR is set, and is bounded by the buffer's timeout sb_timeo.
 * Returns the tsleep() error (0, EINTR, EWOULDBLOCK, ...).
 */
int
sbwait(sb)
	struct sockbuf *sb;
{

	sb->sb_flags |= SB_WAIT;
	return (tsleep((caddr_t)&sb->sb_cc,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
	    sb->sb_timeo));
}

/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 * NOTE(review): despite the historical wording above, this sleeps while
 * SB_LOCK is held by someone else, then takes the lock itself (it is the
 * slow path of the sblock() macro) -- confirm against socketvar.h.
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	int error;

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;	/* ask the holder to wake us */
		if (error = tsleep((caddr_t)&sb->sb_flags,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
		    netio, 0))
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p;

	/* wake select()ers, then any process sleeping in sbwait() */
	selwakeup(&sb->sb_sel);
	sb->sb_flags &= ~SB_SEL;
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		/*
		 * so_pgid < 0 names a process group (signal the group);
		 * so_pgid > 0 names a single process (signal it if it
		 * still exists).
		 */
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().
This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */

/*
 * Reserve send and receive buffer space for a socket and establish
 * default low-water marks.  Returns 0 on success or ENOBUFS, undoing
 * the send-side reservation if the receive-side one fails.
 */
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{

	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	/* default low-water marks: 1 byte to read, one cluster to write */
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	/* low-water mark may never exceed the high-water mark */
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 * Returns 1 on success, 0 if cc exceeds the system limit derived
 * from sb_max.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{

	/* cap so that sb_max bytes of mbuf+cluster overhead is never exceeded */
	if (cc > sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
void
sbrelease(sb)
	struct sockbuf *sb;
{

	sbflush(sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.
In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n;

	if (m == 0)
		return;
	if (n = sb->sb_mb) {
		/* advance to the last record, then to its last mbuf */
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				/* last record already ended: start a new one */
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	}
	/* n is the last mbuf of the last record (or nil if buffer empty) */
	sbcompress(sb, m, n);
}

#ifdef SOCKBUF_DEBUG
/*
 * Diagnostic: verify that sb_cc and sb_mbcnt agree with the actual
 * contents of the buffer's (single-record) mbuf chain; panic otherwise.
 */
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register int len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = m->m_next) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		if (m->m_nextpkt)
			panic("sbcheck nextpkt");
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %d != %d || mbcnt %d != %d\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return;
	/* find the current last record, if any */
	if (m = sb->sb_mb)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	/* keep M_EOR on the record's final mbuf before compressing the rest */
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	register struct mbuf **mp;

	if (m0 == 0)
		return;
	/* skip past existing OOB records (and any leading control mbufs) */
	for (mp = &sb->sb_mb; m = *mp; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if (m = m->m_next)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	/* as in sbappendrecord(): move M_EOR to the record's last mbuf */
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	/* total space needed: address + data + control */
	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	/* the address must fit in a single small mbuf */
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	/* chain: name mbuf -> control chain -> data chain */
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	/* append the assembled record at the tail of the record list */
	if (n = sb->sb_mb) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	return (1);
}

/*
 * As above but with no address: append control data followed by data
 * as a new record.  Returns 0 if there is no space, 1 on success.
 */
int
sbappendcontrol(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	/* append the record at the tail of the record list */
	if (n = sb->sb_mb) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		/*
		 * Discard an empty mbuf unless it carries the only M_EOR
		 * and there is no same-type successor to move the flag to.
		 */
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		/*
		 * Copy small data into the tail of the previous mbuf when
		 * it has internal storage (no M_EXT), does not end a
		 * record, has room, and the types match.
		 */
		if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 &&
		    (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		/* otherwise link the mbuf in as-is and account for it */
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;	/* eor is tracked separately below */
		m = m->m_next;
		n->m_next = 0;
	}
	/* restore M_EOR onto the last mbuf kept, if any mbuf carried it */
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{

	if (sb->sb_flags & SB_LOCK)
		panic("sbflush");
	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);
	if (sb->sb_cc || sb->sb_mb)
		panic("sbflush 2");
}

/*
 * Drop data from (the front of) a sockbuf.
 * len bytes are discarded, crossing record boundaries if necessary;
 * trailing zero-length mbufs of the current record are freed as well.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *mn;
	struct mbuf *next;

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	while (len > 0) {
		if (m == 0) {
			/* current record exhausted: move to the next one */
			if (next == 0)
				panic("sbdrop");
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			/* partial mbuf: trim from the front and stop */
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	/* also discard any now-empty mbufs at the record head */
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		MFREE(m, mn);
		m = mn;
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
+ */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + } while (m = mn); + } +} diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c new file mode 100644 index 000000000000..800434c7f3ca --- /dev/null +++ b/sys/kern/uipc_syscalls.c @@ -0,0 +1,1263 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_syscalls.c 8.6 (Berkeley) 2/14/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filedesc.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <sys/mount.h> +#include <sys/syscallargs.h> + +/* + * System call interface to the socket abstraction. 
+ */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#define COMPAT_OLDSOCK +#endif + +extern struct fileops socketops; + +int +socket(p, uap, retval) + struct proc *p; + register struct socket_args /* { + syscallarg(int) domain; + syscallarg(int) type; + syscallarg(int) protocol; + } */ *uap; + register_t *retval; +{ + struct filedesc *fdp = p->p_fd; + struct socket *so; + struct file *fp; + int fd, error; + + if (error = falloc(p, &fp, &fd)) + return (error); + fp->f_flag = FREAD|FWRITE; + fp->f_type = DTYPE_SOCKET; + fp->f_ops = &socketops; + if (error = socreate(SCARG(uap, domain), &so, SCARG(uap, type), + SCARG(uap, protocol))) { + fdp->fd_ofiles[fd] = 0; + ffree(fp); + } else { + fp->f_data = (caddr_t)so; + *retval = fd; + } + return (error); +} + +/* ARGSUSED */ +int +bind(p, uap, retval) + struct proc *p; + register struct bind_args /* { + syscallarg(int) s; + syscallarg(caddr_t) name; + syscallarg(int) namelen; + } */ *uap; + register_t *retval; +{ + struct file *fp; + struct mbuf *nam; + int error; + + if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + return (error); + if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), + MT_SONAME)) + return (error); + error = sobind((struct socket *)fp->f_data, nam); + m_freem(nam); + return (error); +} + +/* ARGSUSED */ +int +listen(p, uap, retval) + struct proc *p; + register struct listen_args /* { + syscallarg(int) s; + syscallarg(int) backlog; + } */ *uap; + register_t *retval; +{ + struct file *fp; + int error; + + if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + return (error); + return (solisten((struct socket *)fp->f_data, SCARG(uap, backlog))); +} + +#ifdef COMPAT_OLDSOCK +int +accept(p, uap, retval) + struct proc *p; + struct accept_args /* { + syscallarg(int) s; + syscallarg(caddr_t) name; + syscallarg(int *) anamelen; + } */ *uap; + register_t *retval; +{ + + return (accept1(p, uap, retval, 0)); +} + +int +compat_43_accept(p, uap, retval) + struct proc *p; + struct accept_args /* { + 
syscallarg(int) s; + syscallarg(caddr_t) name; + syscallarg(int *) anamelen; + } */ *uap; + register_t *retval; +{ + + return (accept1(p, uap, retval, 1)); +} +#else /* COMPAT_OLDSOCK */ + +#define accept1 accept +#endif + +int +accept1(p, uap, retval, compat_43) + struct proc *p; + register struct accept_args /* { + syscallarg(int) s; + syscallarg(caddr_t) name; + syscallarg(int *) anamelen; + } */ *uap; + register_t *retval; + int compat_43; +{ + struct file *fp; + struct mbuf *nam; + int namelen, error, s, tmpfd; + register struct socket *so; + + if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, anamelen), + (caddr_t)&namelen, sizeof (namelen)))) + return (error); + if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + return (error); + s = splnet(); + so = (struct socket *)fp->f_data; + if ((so->so_options & SO_ACCEPTCONN) == 0) { + splx(s); + return (EINVAL); + } + if ((so->so_state & SS_NBIO) && so->so_qlen == 0) { + splx(s); + return (EWOULDBLOCK); + } + while (so->so_qlen == 0 && so->so_error == 0) { + if (so->so_state & SS_CANTRCVMORE) { + so->so_error = ECONNABORTED; + break; + } + if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + netcon, 0)) { + splx(s); + return (error); + } + } + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + return (error); + } + if (error = falloc(p, &fp, &tmpfd)) { + splx(s); + return (error); + } + *retval = tmpfd; + { struct socket *aso = so->so_q; + if (soqremque(aso, 1) == 0) + panic("accept"); + so = aso; + } + fp->f_type = DTYPE_SOCKET; + fp->f_flag = FREAD|FWRITE; + fp->f_ops = &socketops; + fp->f_data = (caddr_t)so; + nam = m_get(M_WAIT, MT_SONAME); + (void) soaccept(so, nam); + if (SCARG(uap, name)) { +#ifdef COMPAT_OLDSOCK + if (compat_43) + mtod(nam, struct osockaddr *)->sa_family = + mtod(nam, struct sockaddr *)->sa_family; +#endif + if (namelen > nam->m_len) + namelen = nam->m_len; + /* SHOULD COPY OUT A CHAIN HERE */ + if ((error = copyout(mtod(nam, caddr_t), + 
(caddr_t)SCARG(uap, name), (u_int)namelen)) == 0) + error = copyout((caddr_t)&namelen, + (caddr_t)SCARG(uap, anamelen), + sizeof (*SCARG(uap, anamelen))); + } + m_freem(nam); + splx(s); + return (error); +} + +/* ARGSUSED */ +int +connect(p, uap, retval) + struct proc *p; + register struct connect_args /* { + syscallarg(int) s; + syscallarg(caddr_t) name; + syscallarg(int) namelen; + } */ *uap; + register_t *retval; +{ + struct file *fp; + register struct socket *so; + struct mbuf *nam; + int error, s; + + if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) + return (EALREADY); + if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), + MT_SONAME)) + return (error); + error = soconnect(so, nam); + if (error) + goto bad; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + m_freem(nam); + return (EINPROGRESS); + } + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) + if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + netcon, 0)) + break; + if (error == 0) { + error = so->so_error; + so->so_error = 0; + } + splx(s); +bad: + so->so_state &= ~SS_ISCONNECTING; + m_freem(nam); + if (error == ERESTART) + error = EINTR; + return (error); +} + +int +socketpair(p, uap, retval) + struct proc *p; + register struct socketpair_args /* { + syscallarg(int) domain; + syscallarg(int) type; + syscallarg(int) protocol; + syscallarg(int *) rsv; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + struct file *fp1, *fp2; + struct socket *so1, *so2; + int fd, error, sv[2]; + + if (error = socreate(SCARG(uap, domain), &so1, SCARG(uap, type), + SCARG(uap, protocol))) + return (error); + if (error = socreate(SCARG(uap, domain), &so2, SCARG(uap, type), + SCARG(uap, protocol))) + goto free1; + if (error = falloc(p, &fp1, &fd)) + goto free2; + sv[0] = fd; + fp1->f_flag = 
FREAD|FWRITE; + fp1->f_type = DTYPE_SOCKET; + fp1->f_ops = &socketops; + fp1->f_data = (caddr_t)so1; + if (error = falloc(p, &fp2, &fd)) + goto free3; + fp2->f_flag = FREAD|FWRITE; + fp2->f_type = DTYPE_SOCKET; + fp2->f_ops = &socketops; + fp2->f_data = (caddr_t)so2; + sv[1] = fd; + if (error = soconnect2(so1, so2)) + goto free4; + if (SCARG(uap, type) == SOCK_DGRAM) { + /* + * Datagram socket connection is asymmetric. + */ + if (error = soconnect2(so2, so1)) + goto free4; + } + error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, rsv), + 2 * sizeof (int)); + retval[0] = sv[0]; /* XXX ??? */ + retval[1] = sv[1]; /* XXX ??? */ + return (error); +free4: + ffree(fp2); + fdp->fd_ofiles[sv[1]] = 0; +free3: + ffree(fp1); + fdp->fd_ofiles[sv[0]] = 0; +free2: + (void)soclose(so2); +free1: + (void)soclose(so1); + return (error); +} + +int +sendto(p, uap, retval) + struct proc *p; + register struct sendto_args /* { + syscallarg(int) s; + syscallarg(caddr_t) buf; + syscallarg(size_t) len; + syscallarg(int) flags; + syscallarg(caddr_t) to; + syscallarg(int) tolen; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = SCARG(uap, to); + msg.msg_namelen = SCARG(uap, tolen); + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + msg.msg_control = 0; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, len); + return (sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval)); +} + +#ifdef COMPAT_OLDSOCK +int +compat_43_send(p, uap, retval) + struct proc *p; + register struct compat_43_send_args /* { + syscallarg(int) s; + syscallarg(caddr_t) buf; + syscallarg(int) len; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, len); + msg.msg_control = 0; + msg.msg_flags = 0; + return 
(sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval)); +} + +#define MSG_COMPAT 0x8000 +int +compat_43_sendmsg(p, uap, retval) + struct proc *p; + register struct compat_43_sendmsg_args /* { + syscallarg(int) s; + syscallarg(caddr_t) msg; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + if (error = copyin(SCARG(uap, msg), (caddr_t)&msg, + sizeof (struct omsghdr))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + goto done; + msg.msg_flags = MSG_COMPAT; + msg.msg_iov = iov; + error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +sendmsg(p, uap, retval) + struct proc *p; + register struct sendmsg_args /* { + syscallarg(int) s; + syscallarg(caddr_t) msg; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + if (error = copyin(SCARG(uap, msg), (caddr_t)&msg, sizeof (msg))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + if (msg.msg_iovlen && + (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +int 
+sendit(p, s, mp, flags, retsize) + register struct proc *p; + int s; + register struct msghdr *mp; + int flags; + register_t *retsize; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + struct mbuf *to, *control; + int len, error; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (error = getsock(p->p_fd, s, &fp)) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if (auio.uio_resid + iov->iov_len < auio.uio_resid) + return (EINVAL); + auio.uio_resid += iov->iov_len; + } + if (mp->msg_name) { + if (error = sockargs(&to, mp->msg_name, mp->msg_namelen, + MT_SONAME)) + return (error); + } else + to = 0; + if (mp->msg_control) { + if (mp->msg_controllen < sizeof(struct cmsghdr) +#ifdef COMPAT_OLDSOCK + && mp->msg_flags != MSG_COMPAT +#endif + ) { + error = EINVAL; + goto bad; + } + if (error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL)) + goto bad; +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags == MSG_COMPAT) { + register struct cmsghdr *cm; + + M_PREPEND(control, sizeof(*cm), M_WAIT); + if (control == 0) { + error = ENOBUFS; + goto bad; + } else { + cm = mtod(control, struct cmsghdr *); + cm->cmsg_len = control->m_len; + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + } + } +#endif + } else + control = 0; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + if (error = sosend((struct socket *)fp->f_data, to, &auio, + (struct mbuf *)0, control, flags)) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == 
EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + if (error == 0) + *retsize = len - auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_WRITE, + ktriov, *retsize, error); + FREE(ktriov, M_TEMP); + } +#endif +bad: + if (to) + m_freem(to); + return (error); +} + +#ifdef COMPAT_OLDSOCK +int +compat_43_recvfrom(p, uap, retval) + struct proc *p; + struct recvfrom_args /* { + syscallarg(int) s; + syscallarg(caddr_t) buf; + syscallarg(size_t) len; + syscallarg(int) flags; + syscallarg(caddr_t) from; + syscallarg(int *) fromlenaddr; + } */ *uap; + register_t *retval; +{ + + SCARG(uap, flags) |= MSG_COMPAT; + return (recvfrom(p, uap, retval)); +} +#endif + +int +recvfrom(p, uap, retval) + struct proc *p; + register struct recvfrom_args /* { + syscallarg(int) s; + syscallarg(caddr_t) buf; + syscallarg(size_t) len; + syscallarg(int) flags; + syscallarg(caddr_t) from; + syscallarg(int *) fromlenaddr; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + if (SCARG(uap, fromlenaddr)) { + if (error = copyin((caddr_t)SCARG(uap, fromlenaddr), + (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen))) + return (error); + } else + msg.msg_namelen = 0; + msg.msg_name = SCARG(uap, from); + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, len); + msg.msg_control = 0; + msg.msg_flags = SCARG(uap, flags); + return (recvit(p, SCARG(uap, s), &msg, + (caddr_t)SCARG(uap, fromlenaddr), retval)); +} + +#ifdef COMPAT_OLDSOCK +int +compat_43_recv(p, uap, retval) + struct proc *p; + register struct compat_43_recv_args /* { + syscallarg(int) s; + syscallarg(caddr_t) buf; + syscallarg(int) len; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = SCARG(uap, buf); + 
aiov.iov_len = SCARG(uap, len); + msg.msg_control = 0; + msg.msg_flags = SCARG(uap, flags); + return (recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval)); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. + */ +int +compat_43_recvmsg(p, uap, retval) + struct proc *p; + register struct compat_43_recvmsg_args /* { + syscallarg(int) s; + syscallarg(struct omsghdr *) msg; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg, + sizeof (struct omsghdr))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + msg.msg_flags = SCARG(uap, flags) | MSG_COMPAT; + if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + goto done; + msg.msg_iov = iov; + error = recvit(p, SCARG(uap, s), &msg, + (caddr_t)&SCARG(uap, msg)->msg_namelen, retval); + + if (msg.msg_controllen && error == 0) + error = copyout((caddr_t)&msg.msg_controllen, + (caddr_t)&SCARG(uap, msg)->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +recvmsg(p, uap, retval) + struct proc *p; + register struct recvmsg_args /* { + syscallarg(int) s; + syscallarg(struct msghdr *) msg; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg, + sizeof (msg))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= 
UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = SCARG(uap, flags) &~ MSG_COMPAT; +#else + msg.msg_flags = SCARG(uap, flags); +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + if (error = copyin((caddr_t)uiov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + goto done; + if ((error = recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval)) == 0) { + msg.msg_iov = uiov; + error = copyout((caddr_t)&msg, (caddr_t)SCARG(uap, msg), + sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +int +recvit(p, s, mp, namelenp, retsize) + register struct proc *p; + int s; + register struct msghdr *mp; + caddr_t namelenp; + register_t *retsize; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + int len, error; + struct mbuf *from = 0, *control = 0; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (error = getsock(p->p_fd, s, &fp)) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if (auio.uio_resid + iov->iov_len < auio.uio_resid) + return (EINVAL); + auio.uio_resid += iov->iov_len; + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + if (error = soreceive((struct socket *)fp->f_data, &from, &auio, + (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, + &mp->msg_flags)) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_READ, + ktriov, len - auio.uio_resid, error); + FREE(ktriov, M_TEMP); + } +#endif + if (error) + goto out; + *retsize = len - auio.uio_resid; + if (mp->msg_name) { + len = mp->msg_namelen; + if (len <= 0 || from == 0) + len = 0; + else { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + mtod(from, struct osockaddr *)->sa_family = + mtod(from, struct sockaddr *)->sa_family; +#endif + if (len > from->m_len) + len = from->m_len; + /* else if len < from->m_len ??? */ + if (error = copyout(mtod(from, caddr_t), + (caddr_t)mp->msg_name, (unsigned)len)) + goto out; + } + mp->msg_namelen = len; + if (namelenp && + (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + error = 0; /* old recvfrom didn't check */ + else +#endif + goto out; + } + } + if (mp->msg_control) { +#ifdef COMPAT_OLDSOCK + /* + * We assume that old recvmsg calls won't receive access + * rights and other control info, esp. as control info + * is always optional and those options didn't exist in 4.3. + * If we receive rights, trim the cmsghdr; anything else + * is tossed. 
+ */ + if (control && mp->msg_flags & MSG_COMPAT) { + if (mtod(control, struct cmsghdr *)->cmsg_level != + SOL_SOCKET || + mtod(control, struct cmsghdr *)->cmsg_type != + SCM_RIGHTS) { + mp->msg_controllen = 0; + goto out; + } + control->m_len -= sizeof (struct cmsghdr); + control->m_data += sizeof (struct cmsghdr); + } +#endif + len = mp->msg_controllen; + if (len <= 0 || control == 0) + len = 0; + else { + if (len >= control->m_len) + len = control->m_len; + else + mp->msg_flags |= MSG_CTRUNC; + error = copyout((caddr_t)mtod(control, caddr_t), + (caddr_t)mp->msg_control, (unsigned)len); + } + mp->msg_controllen = len; + } +out: + if (from) + m_freem(from); + if (control) + m_freem(control); + return (error); +} + +/* ARGSUSED */ +int +shutdown(p, uap, retval) + struct proc *p; + register struct shutdown_args /* { + syscallarg(int) s; + syscallarg(int) how; + } */ *uap; + register_t *retval; +{ + struct file *fp; + int error; + + if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + return (error); + return (soshutdown((struct socket *)fp->f_data, SCARG(uap, how))); +} + +/* ARGSUSED */ +int +setsockopt(p, uap, retval) + struct proc *p; + register struct setsockopt_args /* { + syscallarg(int) s; + syscallarg(int) level; + syscallarg(int) name; + syscallarg(caddr_t) val; + syscallarg(int) valsize; + } */ *uap; + register_t *retval; +{ + struct file *fp; + struct mbuf *m = NULL; + int error; + + if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + return (error); + if (SCARG(uap, valsize) > MLEN) + return (EINVAL); + if (SCARG(uap, val)) { + m = m_get(M_WAIT, MT_SOOPTS); + if (m == NULL) + return (ENOBUFS); + if (error = copyin(SCARG(uap, val), mtod(m, caddr_t), + (u_int)SCARG(uap, valsize))) { + (void) m_free(m); + return (error); + } + m->m_len = SCARG(uap, valsize); + } + return (sosetopt((struct socket *)fp->f_data, SCARG(uap, level), + SCARG(uap, name), m)); +} + +/* ARGSUSED */ +int +getsockopt(p, uap, retval) + struct proc *p; + register struct getsockopt_args 
/* { + syscallarg(int) s; + syscallarg(int) level; + syscallarg(int) name; + syscallarg(caddr_t) val; + syscallarg(int *) avalsize; + } */ *uap; + register_t *retval; +{ + struct file *fp; + struct mbuf *m = NULL; + int valsize, error; + + if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + return (error); + if (SCARG(uap, val)) { + if (error = copyin((caddr_t)SCARG(uap, avalsize), + (caddr_t)&valsize, sizeof (valsize))) + return (error); + } else + valsize = 0; + if ((error = sogetopt((struct socket *)fp->f_data, SCARG(uap, level), + SCARG(uap, name), &m)) == 0 && SCARG(uap, val) && valsize && + m != NULL) { + if (valsize > m->m_len) + valsize = m->m_len; + error = copyout(mtod(m, caddr_t), SCARG(uap, val), + (u_int)valsize); + if (error == 0) + error = copyout((caddr_t)&valsize, + (caddr_t)SCARG(uap, avalsize), sizeof (valsize)); + } + if (m != NULL) + (void) m_free(m); + return (error); +} + +/* ARGSUSED */ +int +pipe(p, uap, retval) + struct proc *p; + void *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + struct file *rf, *wf; + struct socket *rso, *wso; + int fd, error; + + if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0)) + return (error); + if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0)) + goto free1; + if (error = falloc(p, &rf, &fd)) + goto free2; + retval[0] = fd; + rf->f_flag = FREAD; + rf->f_type = DTYPE_SOCKET; + rf->f_ops = &socketops; + rf->f_data = (caddr_t)rso; + if (error = falloc(p, &wf, &fd)) + goto free3; + wf->f_flag = FWRITE; + wf->f_type = DTYPE_SOCKET; + wf->f_ops = &socketops; + wf->f_data = (caddr_t)wso; + retval[1] = fd; + if (error = unp_connect2(wso, rso)) + goto free4; + return (0); +free4: + ffree(wf); + fdp->fd_ofiles[retval[1]] = 0; +free3: + ffree(rf); + fdp->fd_ofiles[retval[0]] = 0; +free2: + (void)soclose(wso); +free1: + (void)soclose(rso); + return (error); +} + +/* + * Get socket name. 
+ */ +#ifdef COMPAT_OLDSOCK +int +getsockname(p, uap, retval) + struct proc *p; + struct getsockname_args /* { + syscallarg(int) fdes; + syscallarg(caddr_t) asa; + syscallarg(int *) alen; + } */ *uap; + register_t *retval; +{ + + return (getsockname1(p, uap, retval, 0)); +} + +int +compat_43_getsockname(p, uap, retval) + struct proc *p; + struct getsockname_args /* { + syscallarg(int) fdes; + syscallarg(caddr_t) asa; + syscallarg(int *) alen; + } */ *uap; + register_t *retval; +{ + + return (getsockname1(p, uap, retval, 1)); +} +#else /* COMPAT_OLDSOCK */ + +#define getsockname1 getsockname +#endif + +/* ARGSUSED */ +int +getsockname1(p, uap, retval, compat_43) + struct proc *p; + register struct getsockname_args /* { + syscallarg(int) fdes; + syscallarg(caddr_t) asa; + syscallarg(int *) alen; + } */ *uap; + register_t *retval; + int compat_43; +{ + struct file *fp; + register struct socket *so; + struct mbuf *m; + int len, error; + + if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp)) + return (error); + if (error = copyin((caddr_t)SCARG(uap, alen), (caddr_t)&len, + sizeof (len))) + return (error); + so = (struct socket *)fp->f_data; + m = m_getclr(M_WAIT, MT_SONAME); + if (m == NULL) + return (ENOBUFS); + if (error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0)) + goto bad; + if (len > m->m_len) + len = m->m_len; +#ifdef COMPAT_OLDSOCK + if (compat_43) + mtod(m, struct osockaddr *)->sa_family = + mtod(m, struct sockaddr *)->sa_family; +#endif + error = copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len); + if (error == 0) + error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen), + sizeof (len)); +bad: + m_freem(m); + return (error); +} + +/* + * Get name of peer for connected socket. 
+ */ +#ifdef COMPAT_OLDSOCK +int +getpeername(p, uap, retval) + struct proc *p; + struct getpeername_args /* { + syscallarg(int) fdes; + syscallarg(caddr_t) asa; + syscallarg(int *) alen; + } */ *uap; + register_t *retval; +{ + + return (getpeername1(p, uap, retval, 0)); +} + +int +compat_43_getpeername(p, uap, retval) + struct proc *p; + struct getpeername_args /* { + syscallarg(int) fdes; + syscallarg(caddr_t) asa; + syscallarg(int *) alen; + } */ *uap; + register_t *retval; +{ + + return (getpeername1(p, uap, retval, 1)); +} +#else /* COMPAT_OLDSOCK */ + +#define getpeername1 getpeername +#endif + +/* ARGSUSED */ +int +getpeername1(p, uap, retval, compat_43) + struct proc *p; + register struct getpeername_args /* { + syscallarg(int) fdes; + syscallarg(caddr_t) asa; + syscallarg(int *) alen; + } */ *uap; + register_t *retval; + int compat_43; +{ + struct file *fp; + register struct socket *so; + struct mbuf *m; + int len, error; + + if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp)) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) + return (ENOTCONN); + if (error = + copyin((caddr_t)SCARG(uap, alen), (caddr_t)&len, sizeof (len))) + return (error); + m = m_getclr(M_WAIT, MT_SONAME); + if (m == NULL) + return (ENOBUFS); + if (error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0)) + goto bad; + if (len > m->m_len) + len = m->m_len; +#ifdef COMPAT_OLDSOCK + if (compat_43) + mtod(m, struct osockaddr *)->sa_family = + mtod(m, struct sockaddr *)->sa_family; +#endif + if (error = + copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len)) + goto bad; + error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen), sizeof (len)); +bad: + m_freem(m); + return (error); +} + +int +sockargs(mp, buf, buflen, type) + struct mbuf **mp; + caddr_t buf; + int buflen, type; +{ + register struct sockaddr *sa; + register struct mbuf *m; + int error; + + if ((u_int)buflen > MLEN) { +#ifdef COMPAT_OLDSOCK + 
		if (type == MT_SONAME && (u_int)buflen <= 112)
			/*
			 * Old unix-domain compat hack: 112 was presumably the
			 * historical sizeof(struct sockaddr_un) — 4.3 apps pass
			 * the whole struct; clamp to MLEN instead of failing.
			 * TODO confirm the 112 origin.
			 */
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
		return (EINVAL);
	}
	m = m_get(M_WAIT, type);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = buflen;
	/* Pull the user's address/option bytes into the mbuf. */
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error) {
		(void) m_free(m);
		return (error);
	}
	*mp = m;
	if (type == MT_SONAME) {
		sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		/*
		 * 4.3BSD sockaddrs carried a 16-bit sa_family.  On a
		 * little-endian host the family value lands in the byte
		 * that is now sa_len; move it back into sa_family.
		 */
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		/* Kernel always stamps the true length. */
		sa->sa_len = buflen;
	}
	return (0);
}

/*
 * getsock --
 *	Translate descriptor fdes into its file table entry and verify
 *	that it refers to a socket.
 *
 *	Returns 0 with *fpp set on success; EBADF if fdes is out of
 *	range or not open; ENOTSOCK if the file is not a socket.
 */
int
getsock(fdp, fdes, fpp)
	struct filedesc *fdp;
	int fdes;
	struct file **fpp;
{
	register struct file *fp;

	if ((unsigned)fdes >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fdes]) == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_SOCKET)
		return (ENOTSOCK);
	*fpp = fp;
	return (0);
}
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
new file mode 100644
index 000000000000..c6bcbfd9e2ed
--- /dev/null
+++ b/sys/kern/uipc_usrreq.c
@@ -0,0 +1,839 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/filedesc.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/unpcb.h> +#include <sys/un.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/mbuf.h> + +/* + * Unix communications domain. 
+ * + * TODO: + * SEQPACKET, RDM + * rethink name space problems + * need a proper out-of-band + */ +struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; +ino_t unp_ino; /* prototype for fake inode numbers */ + +/*ARGSUSED*/ +int +uipc_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + struct unpcb *unp = sotounpcb(so); + register struct socket *so2; + register int error = 0; + struct proc *p = curproc; /* XXX */ + + if (req == PRU_CONTROL) + return (EOPNOTSUPP); + if (req != PRU_SEND && control && control->m_len) { + error = EOPNOTSUPP; + goto release; + } + if (unp == 0 && req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + switch (req) { + + case PRU_ATTACH: + if (unp) { + error = EISCONN; + break; + } + error = unp_attach(so); + break; + + case PRU_DETACH: + unp_detach(unp); + break; + + case PRU_BIND: + error = unp_bind(unp, nam, p); + break; + + case PRU_LISTEN: + if (unp->unp_vnode == 0) + error = EINVAL; + break; + + case PRU_CONNECT: + error = unp_connect(so, nam, p); + break; + + case PRU_CONNECT2: + error = unp_connect2(so, (struct socket *)nam); + break; + + case PRU_DISCONNECT: + unp_disconnect(unp); + break; + + case PRU_ACCEPT: + /* + * Pass back name of connected socket, + * if it was bound and we are still connected + * (our peer may have closed already!). 
+ */ + if (unp->unp_conn && unp->unp_conn->unp_addr) { + nam->m_len = unp->unp_conn->unp_addr->m_len; + bcopy(mtod(unp->unp_conn->unp_addr, caddr_t), + mtod(nam, caddr_t), (unsigned)nam->m_len); + } else { + nam->m_len = sizeof(sun_noname); + *(mtod(nam, struct sockaddr *)) = sun_noname; + } + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + unp_shutdown(unp); + break; + + case PRU_RCVD: + switch (so->so_type) { + + case SOCK_DGRAM: + panic("uipc 1"); + /*NOTREACHED*/ + + case SOCK_STREAM: +#define rcv (&so->so_rcv) +#define snd (&so2->so_snd) + if (unp->unp_conn == 0) + break; + so2 = unp->unp_conn->unp_socket; + /* + * Adjust backpressure on sender + * and wakeup any waiting to write. + */ + snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt; + unp->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat += unp->unp_cc - rcv->sb_cc; + unp->unp_cc = rcv->sb_cc; + sowwakeup(so2); +#undef snd +#undef rcv + break; + + default: + panic("uipc 2"); + } + break; + + case PRU_SEND: + if (control && (error = unp_internalize(control, p))) + break; + switch (so->so_type) { + + case SOCK_DGRAM: { + struct sockaddr *from; + + if (nam) { + if (unp->unp_conn) { + error = EISCONN; + break; + } + error = unp_connect(so, nam, p); + if (error) + break; + } else { + if (unp->unp_conn == 0) { + error = ENOTCONN; + break; + } + } + so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) + from = mtod(unp->unp_addr, struct sockaddr *); + else + from = &sun_noname; + if (sbappendaddr(&so2->so_rcv, from, m, control)) { + sorwakeup(so2); + m = 0; + control = 0; + } else + error = ENOBUFS; + if (nam) + unp_disconnect(unp); + break; + } + + case SOCK_STREAM: +#define rcv (&so2->so_rcv) +#define snd (&so->so_snd) + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + if (unp->unp_conn == 0) + panic("uipc 3"); + so2 = unp->unp_conn->unp_socket; + /* + * Send to paired receive port, and then reduce + * send buffer hiwater marks to maintain backpressure. + * Wake up readers. 
+ */ + if (control) { + if (sbappendcontrol(rcv, m, control)) + control = 0; + } else + sbappend(rcv, m); + snd->sb_mbmax -= + rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; + unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc; + unp->unp_conn->unp_cc = rcv->sb_cc; + sorwakeup(so2); + m = 0; +#undef snd +#undef rcv + break; + + default: + panic("uipc 4"); + } + break; + + case PRU_ABORT: + unp_drop(unp, ECONNABORTED); + break; + + case PRU_SENSE: + ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; + if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) { + so2 = unp->unp_conn->unp_socket; + ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc; + } + ((struct stat *) m)->st_dev = NODEV; + if (unp->unp_ino == 0) + unp->unp_ino = unp_ino++; + ((struct stat *) m)->st_ino = unp->unp_ino; + return (0); + + case PRU_RCVOOB: + return (EOPNOTSUPP); + + case PRU_SENDOOB: + error = EOPNOTSUPP; + break; + + case PRU_SOCKADDR: + if (unp->unp_addr) { + nam->m_len = unp->unp_addr->m_len; + bcopy(mtod(unp->unp_addr, caddr_t), + mtod(nam, caddr_t), (unsigned)nam->m_len); + } else + nam->m_len = 0; + break; + + case PRU_PEERADDR: + if (unp->unp_conn && unp->unp_conn->unp_addr) { + nam->m_len = unp->unp_conn->unp_addr->m_len; + bcopy(mtod(unp->unp_conn->unp_addr, caddr_t), + mtod(nam, caddr_t), (unsigned)nam->m_len); + } else + nam->m_len = 0; + break; + + case PRU_SLOWTIMO: + break; + + default: + panic("piusrreq"); + } +release: + if (control) + m_freem(control); + if (m) + m_freem(m); + return (error); +} + +/* + * Both send and receive buffers are allocated PIPSIZ bytes of buffering + * for stream sockets, although the total for sender and receiver is + * actually only PIPSIZ. + * Datagram sockets really use the sendspace as the maximum datagram size, + * and don't really want to reserve the sendspace. Their recvspace should + * be large enough for at least one max-size datagram plus address. 
+ */ +#define PIPSIZ 4096 +u_long unpst_sendspace = PIPSIZ; +u_long unpst_recvspace = PIPSIZ; +u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +u_long unpdg_recvspace = 4*1024; + +int unp_rights; /* file descriptors in flight */ + +int +unp_attach(so) + struct socket *so; +{ + register struct mbuf *m; + register struct unpcb *unp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + switch (so->so_type) { + + case SOCK_STREAM: + error = soreserve(so, unpst_sendspace, unpst_recvspace); + break; + + case SOCK_DGRAM: + error = soreserve(so, unpdg_sendspace, unpdg_recvspace); + break; + + default: + panic("unp_attach"); + } + if (error) + return (error); + } + m = m_getclr(M_DONTWAIT, MT_PCB); + if (m == NULL) + return (ENOBUFS); + unp = mtod(m, struct unpcb *); + so->so_pcb = (caddr_t)unp; + unp->unp_socket = so; + return (0); +} + +void +unp_detach(unp) + register struct unpcb *unp; +{ + + if (unp->unp_vnode) { + unp->unp_vnode->v_socket = 0; + vrele(unp->unp_vnode); + unp->unp_vnode = 0; + } + if (unp->unp_conn) + unp_disconnect(unp); + while (unp->unp_refs) + unp_drop(unp->unp_refs, ECONNRESET); + soisdisconnected(unp->unp_socket); + unp->unp_socket->so_pcb = 0; + m_freem(unp->unp_addr); + (void) m_free(dtom(unp)); + if (unp_rights) { + /* + * Normally the receive buffer is flushed later, + * in sofree, but if our receive buffer holds references + * to descriptors that are now garbage, we will dispose + * of those descriptor references after the garbage collector + * gets them (resulting in a "panic: closef: count < 0"). 
+ */ + sorflush(unp->unp_socket); + unp_gc(); + } +} + +int +unp_bind(unp, nam, p) + struct unpcb *unp; + struct mbuf *nam; + struct proc *p; +{ + struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *); + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE, + soun->sun_path, p); + if (unp->unp_vnode != NULL) + return (EINVAL); + if (nam->m_len == MLEN) { + if (*(mtod(nam, caddr_t) + nam->m_len - 1) != 0) + return (EINVAL); + } else + *(mtod(nam, caddr_t) + nam->m_len) = 0; +/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EADDRINUSE); + } + VATTR_NULL(&vattr); + vattr.va_type = VSOCK; + vattr.va_mode = ACCESSPERMS; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr)) + return (error); + vp = nd.ni_vp; + vp->v_socket = unp->unp_socket; + unp->unp_vnode = vp; + unp->unp_addr = m_copy(nam, 0, (int)M_COPYALL); + VOP_UNLOCK(vp, 0, p); + return (0); +} + +int +unp_connect(so, nam, p) + struct socket *so; + struct mbuf *nam; + struct proc *p; +{ + register struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *); + register struct vnode *vp; + register struct socket *so2, *so3; + struct unpcb *unp2, *unp3; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p); + if (nam->m_data + nam->m_len == &nam->m_dat[MLEN]) { /* XXX */ + if (*(mtod(nam, caddr_t) + nam->m_len - 1) != 0) + return (EMSGSIZE); + } else + *(mtod(nam, caddr_t) + nam->m_len) = 0; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VSOCK) { + error = ENOTSOCK; + goto bad; + } + if (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) + goto bad; + 
so2 = vp->v_socket; + if (so2 == 0) { + error = ECONNREFUSED; + goto bad; + } + if (so->so_type != so2->so_type) { + error = EPROTOTYPE; + goto bad; + } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so2->so_options & SO_ACCEPTCONN) == 0 || + (so3 = sonewconn(so2, 0)) == 0) { + error = ECONNREFUSED; + goto bad; + } + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + unp3->unp_addr = + m_copy(unp2->unp_addr, 0, (int)M_COPYALL); + so2 = so3; + } + error = unp_connect2(so, so2); +bad: + vput(vp); + return (error); +} + +int +unp_connect2(so, so2) + register struct socket *so; + register struct socket *so2; +{ + register struct unpcb *unp = sotounpcb(so); + register struct unpcb *unp2; + + if (so2->so_type != so->so_type) + return (EPROTOTYPE); + unp2 = sotounpcb(so2); + unp->unp_conn = unp2; + switch (so->so_type) { + + case SOCK_DGRAM: + unp->unp_nextref = unp2->unp_refs; + unp2->unp_refs = unp; + soisconnected(so); + break; + + case SOCK_STREAM: + unp2->unp_conn = unp; + soisconnected(so); + soisconnected(so2); + break; + + default: + panic("unp_connect2"); + } + return (0); +} + +void +unp_disconnect(unp) + struct unpcb *unp; +{ + register struct unpcb *unp2 = unp->unp_conn; + + if (unp2 == 0) + return; + unp->unp_conn = 0; + switch (unp->unp_socket->so_type) { + + case SOCK_DGRAM: + if (unp2->unp_refs == unp) + unp2->unp_refs = unp->unp_nextref; + else { + unp2 = unp2->unp_refs; + for (;;) { + if (unp2 == 0) + panic("unp_disconnect"); + if (unp2->unp_nextref == unp) + break; + unp2 = unp2->unp_nextref; + } + unp2->unp_nextref = unp->unp_nextref; + } + unp->unp_nextref = 0; + unp->unp_socket->so_state &= ~SS_ISCONNECTED; + break; + + case SOCK_STREAM: + soisdisconnected(unp->unp_socket); + unp2->unp_conn = 0; + soisdisconnected(unp2->unp_socket); + break; + } +} + +#ifdef notdef +void +unp_abort(unp) + struct unpcb *unp; +{ + + unp_detach(unp); +} +#endif + +void +unp_shutdown(unp) + struct unpcb *unp; +{ + struct socket *so; + + if 
(unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && + (so = unp->unp_conn->unp_socket)) + socantrcvmore(so); +} + +void +unp_drop(unp, errno) + struct unpcb *unp; + int errno; +{ + struct socket *so = unp->unp_socket; + + so->so_error = errno; + unp_disconnect(unp); + if (so->so_head) { + so->so_pcb = (caddr_t) 0; + m_freem(unp->unp_addr); + (void) m_free(dtom(unp)); + sofree(so); + } +} + +#ifdef notdef +unp_drain() +{ + +} +#endif + +int +unp_externalize(rights) + struct mbuf *rights; +{ + struct proc *p = curproc; /* XXX */ + register int i; + register struct cmsghdr *cm = mtod(rights, struct cmsghdr *); + register struct file **rp = (struct file **)(cm + 1); + register struct file *fp; + int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int); + int f; + + if (!fdavail(p, newfds)) { + for (i = 0; i < newfds; i++) { + fp = *rp; + unp_discard(fp); + *rp++ = 0; + } + return (EMSGSIZE); + } + for (i = 0; i < newfds; i++) { + if (fdalloc(p, 0, &f)) + panic("unp_externalize"); + fp = *rp; + p->p_fd->fd_ofiles[f] = fp; + fp->f_msgcount--; + unp_rights--; + *(int *)rp++ = f; + } + return (0); +} + +int +unp_internalize(control, p) + struct mbuf *control; + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + register struct cmsghdr *cm = mtod(control, struct cmsghdr *); + register struct file **rp; + register struct file *fp; + register int i, fd; + int oldfds; + + if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || + cm->cmsg_len != control->m_len) + return (EINVAL); + oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int); + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fd = *(int *)rp++; + if ((unsigned)fd >= fdp->fd_nfiles || + fdp->fd_ofiles[fd] == NULL) + return (EBADF); + } + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fp = fdp->fd_ofiles[*(int *)rp]; + *rp++ = fp; + fp->f_count++; + fp->f_msgcount++; + unp_rights++; + } + return (0); +} + +int unp_defer, unp_gcing; +extern struct domain 
unixdomain; + +void +unp_gc() +{ + register struct file *fp, *nextfp; + register struct socket *so; + struct file **extra_ref, **fpp; + int nunref, i; + + if (unp_gcing) + return; + unp_gcing = 1; + unp_defer = 0; + for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) + fp->f_flag &= ~(FMARK|FDEFER); + do { + for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { + if (fp->f_count == 0) + continue; + if (fp->f_flag & FDEFER) { + fp->f_flag &= ~FDEFER; + unp_defer--; + } else { + if (fp->f_flag & FMARK) + continue; + if (fp->f_count == fp->f_msgcount) + continue; + fp->f_flag |= FMARK; + } + if (fp->f_type != DTYPE_SOCKET || + (so = (struct socket *)fp->f_data) == 0) + continue; + if (so->so_proto->pr_domain != &unixdomain || + (so->so_proto->pr_flags&PR_RIGHTS) == 0) + continue; +#ifdef notdef + if (so->so_rcv.sb_flags & SB_LOCK) { + /* + * This is problematical; it's not clear + * we need to wait for the sockbuf to be + * unlocked (on a uniprocessor, at least), + * and it's also not clear what to do + * if sbwait returns an error due to receipt + * of a signal. If sbwait does return + * an error, we'll go into an infinite + * loop. Delete all of this for now. + */ + (void) sbwait(&so->so_rcv); + goto restart; + } +#endif + unp_scan(so->so_rcv.sb_mb, unp_mark); + } + } while (unp_defer); + /* + * We grab an extra reference to each of the file table entries + * that are not otherwise accessible and then free the rights + * that are stored in messages on them. + * + * The bug in the orginal code is a little tricky, so I'll describe + * what's wrong with it here. + * + * It is incorrect to simply unp_discard each entry for f_msgcount + * times -- consider the case of sockets A and B that contain + * references to each other. On a last close of some other socket, + * we trigger a gc since the number of outstanding rights (unp_rights) + * is non-zero. 
If during the sweep phase the gc code un_discards, + * we end up doing a (full) closef on the descriptor. A closef on A + * results in the following chain. Closef calls soo_close, which + * calls soclose. Soclose calls first (through the switch + * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply + * returns because the previous instance had set unp_gcing, and + * we return all the way back to soclose, which marks the socket + * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush + * to free up the rights that are queued in messages on the socket A, + * i.e., the reference on B. The sorflush calls via the dom_dispose + * switch unp_dispose, which unp_scans with unp_discard. This second + * instance of unp_discard just calls closef on B. + * + * Well, a similar chain occurs on B, resulting in a sorflush on B, + * which results in another closef on A. Unfortunately, A is already + * being closed, and the descriptor has already been marked with + * SS_NOFDREF, and soclose panics at this point. + * + * Here, we first take an extra reference to each inaccessible + * descriptor. Then, we call sorflush ourself, since we know + * it is a Unix domain socket anyhow. After we destroy all the + * rights carried in messages, we do a last closef to get rid + * of our extra reference. This is the last close, and the + * unp_detach etc will shut down the socket. 
+ * + * 91/09/19, bsy@cs.cmu.edu + */ + extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK); + for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0; + fp = nextfp) { + nextfp = fp->f_list.le_next; + if (fp->f_count == 0) + continue; + if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { + *fpp++ = fp; + nunref++; + fp->f_count++; + } + } + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + sorflush((struct socket *)(*fpp)->f_data); + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + closef(*fpp, (struct proc *)NULL); + free((caddr_t)extra_ref, M_FILE); + unp_gcing = 0; +} + +void +unp_dispose(m) + struct mbuf *m; +{ + + if (m) + unp_scan(m, unp_discard); +} + +void +unp_scan(m0, op) + register struct mbuf *m0; + void (*op) __P((struct file *)); +{ + register struct mbuf *m; + register struct file **rp; + register struct cmsghdr *cm; + register int i; + int qfds; + + while (m0) { + for (m = m0; m; m = m->m_next) + if (m->m_type == MT_CONTROL && + m->m_len >= sizeof(*cm)) { + cm = mtod(m, struct cmsghdr *); + if (cm->cmsg_level != SOL_SOCKET || + cm->cmsg_type != SCM_RIGHTS) + continue; + qfds = (cm->cmsg_len - sizeof *cm) + / sizeof (struct file *); + rp = (struct file **)(cm + 1); + for (i = 0; i < qfds; i++) + (*op)(*rp++); + break; /* XXX, but saves time */ + } + m0 = m0->m_act; + } +} + +void +unp_mark(fp) + struct file *fp; +{ + + if (fp->f_flag & FMARK) + return; + unp_defer++; + fp->f_flag |= (FMARK|FDEFER); +} + +void +unp_discard(fp) + struct file *fp; +{ + + fp->f_msgcount--; + unp_rights--; + (void) closef(fp, (struct proc *)NULL); +} diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c new file mode 100644 index 000000000000..c20966bf7775 --- /dev/null +++ b/sys/kern/vfs_cache.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Poul-Henning Kamp of the FreeBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: vfs_cache.c,v 1.11 1995/03/12 02:01:20 phk Exp $ + * + * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/namei.h> +#include <sys/errno.h> +#include <sys/malloc.h> + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (vp, name) where vp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * For simplicity (and economy of storage), names longer than + * a maximum length of NCHNAMLEN are not cached; they occur + * infrequently in any case, and are almost never of interest. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + */ + +/* + * Structures associated with name cacheing. + */ +#define NCHHASH(dvp, cnp) \ + (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) & nchash]) +LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +u_long nchash; /* size of hash table - 1 */ +long numcache; /* number of cache entries allocated */ +TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ +struct nchstats nchstats; /* cache effectiveness statistics */ + +int doingcache = 1; /* 1 => enable the cache */ + +/* + * Delete an entry from its hash list and move it to the front + * of the LRU list for immediate reuse. + */ +#define PURGE(ncp) { \ + LIST_REMOVE(ncp, nc_hash); \ + ncp->nc_hash.le_prev = 0; \ + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ + TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); \ +} + +/* + * Move an entry that has been used to the tail of the LRU list + * so that it will be preserved for future use. 
+ */ +#define TOUCH(ncp) { \ + if (ncp->nc_lru.tqe_next != 0) { \ + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); \ + } \ +} + +/* + * Lookup an entry in the cache + * + * We don't do this if the segment name is long, simply so the cache + * can avoid holding long names (which would either waste space, or + * add greatly to the complexity). + * + * Lookup is called with dvp pointing to the directory to search, + * cnp pointing to the name of the entry being sought. If the lookup + * succeeds, the vnode is returned in *vpp, and a status of -1 is + * returned. If the lookup determines that the name does not exist + * (negative cacheing), a status of ENOENT is returned. If the lookup + * fails, a status of zero is returned. + */ + +int +cache_lookup(dvp, vpp, cnp) + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + register struct namecache *ncp, *nnp; + register struct nchashhead *ncpp; + + if (!doingcache) { + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + if (cnp->cn_namelen > NCHNAMLEN) { + nchstats.ncs_long++; + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + + ncpp = NCHHASH(dvp, cnp); + for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { + nnp = ncp->nc_hash.le_next; + /* If one of the vp's went stale, don't bother anymore. */ + if ((ncp->nc_dvpid != ncp->nc_dvp->v_id) || + (ncp->nc_vp && ncp->nc_vpid != ncp->nc_vp->v_id)) { + nchstats.ncs_falsehits++; + PURGE(ncp); + continue; + } + /* Now that we know the vp's to be valid, is it ours ? 
*/ + if (ncp->nc_dvp == dvp && + ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == 0) { + nchstats.ncs_miss++; + return (0); + } + + /* We don't want to have an entry, so dump it */ + if ((cnp->cn_flags & MAKEENTRY) == 0) { + nchstats.ncs_badhits++; + PURGE(ncp); + return (0); + } + + /* We found a "positive" match, return the vnode */ + if (ncp->nc_vp) { + nchstats.ncs_goodhits++; + TOUCH(ncp); + *vpp = ncp->nc_vp; + return (-1); + } + + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + nchstats.ncs_badhits++; + PURGE(ncp); + return (0); + } + + /* + * We found a "negative" match, ENOENT notifies client of this match. + * The nc_vpid field records whether this is a whiteout. + */ + nchstats.ncs_neghits++; + TOUCH(ncp); + cnp->cn_flags |= ncp->nc_vpid; + return (ENOENT); +} + +/* + * Add an entry to the cache. + */ +void +cache_enter(dvp, vp, cnp) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; +{ + register struct namecache *ncp; + register struct nchashhead *ncpp; + + if (!doingcache) + return; + +#ifdef DIAGNOSTIC + if (cnp->cn_namelen > NCHNAMLEN) + panic("cache_enter: name too long"); +#endif + + /* + * We allocate a new entry if we are less than the maximum + * allowed and the one at the front of the LRU list is in use. + * Otherwise we use the one at the front of the LRU list. 
+ */ + if (numcache < desiredvnodes && + ((ncp = nclruhead.tqh_first) == NULL || + ncp->nc_hash.le_prev != 0)) { + /* Add one more entry */ + ncp = (struct namecache *) + malloc((u_long)sizeof *ncp, M_CACHE, M_WAITOK); + bzero((char *)ncp, sizeof *ncp); + numcache++; + } else if (ncp = nclruhead.tqh_first) { + /* reuse an old entry */ + TAILQ_REMOVE(&nclruhead, ncp, nc_lru); + if (ncp->nc_hash.le_prev != 0) { + LIST_REMOVE(ncp, nc_hash); + ncp->nc_hash.le_prev = 0; + } + } else { + /* give up */ + return; + } + + /* + * Fill in cache info, if vp is NULL this is a "negative" cache entry. + * For negative entries, we have to record whether it is a whiteout. + * the whiteout flag is stored in the nc_vpid field which is + * otherwise unused. + */ + ncp->nc_vp = vp; + if (vp) + ncp->nc_vpid = vp->v_id; + else + ncp->nc_vpid = cnp->cn_flags & ISWHITEOUT; + ncp->nc_dvp = dvp; + ncp->nc_dvpid = dvp->v_id; + ncp->nc_nlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen); + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); + ncpp = NCHHASH(dvp, cnp); + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +void +nchinit() +{ + + TAILQ_INIT(&nclruhead); + nchashtbl = hashinit(desiredvnodes, M_CACHE, &nchash); +} + +/* + * Invalidate a all entries to particular vnode. + * + * We actually just increment the v_id, that will do it. The entries will + * be purged by lookup as they get found. If the v_id wraps around, we + * need to ditch the entire cache, to avoid confusion. No valid vnode will + * ever have (v_id == 0). + */ +void +cache_purge(vp) + struct vnode *vp; +{ + struct namecache *ncp; + struct nchashhead *ncpp; + + vp->v_id = ++nextvnodeid; + if (nextvnodeid != 0) + return; + for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + while (ncp = ncpp->lh_first) + PURGE(ncp); + } + vp->v_id = ++nextvnodeid; +} + +/* + * Flush all entries referencing a particular filesystem. 
+ * + * Since we need to check it anyway, we will flush all the invalid + * entriess at the same time. + */ +void +cache_purgevfs(mp) + struct mount *mp; +{ + struct nchashhead *ncpp; + struct namecache *ncp, *nnp; + + /* Scan hash tables for applicable entries */ + for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { + nnp = ncp->nc_hash.le_next; + if (ncp->nc_dvpid != ncp->nc_dvp->v_id || + (ncp->nc_vp && ncp->nc_vpid != ncp->nc_vp->v_id) || + ncp->nc_dvp->v_mount == mp) { + PURGE(ncp); + } + } + } +} diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c new file mode 100644 index 000000000000..e01d24f099b1 --- /dev/null +++ b/sys/kern/vfs_cluster.c @@ -0,0 +1,756 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95 + */ + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/trace.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> +#include <libkern/libkern.h> + +/* + * Local declarations + */ +struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, + daddr_t, long, int)); +struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, + daddr_t, daddr_t, long, int, long)); +void cluster_wbuild __P((struct vnode *, struct buf *, long, + daddr_t, int, daddr_t)); +struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); + +#ifdef DIAGNOSTIC +/* + * Set to 1 if reads of block zero should cause readahead to be done. + * Set to 0 treats a read of block zero as a non-sequential read. + * + * Setting to one assumes that most reads of block zero of files are due to + * sequential passes over the files (e.g. cat, sum) where additional blocks + * will soon be needed. Setting to zero assumes that the majority are + * surgical strikes to get particular info (e.g. 
size, file) where readahead + * blocks will not be used and, in fact, push out other potentially useful + * blocks from the cache. The former seems intuitive, but some quick tests + * showed that the latter performed better from a system-wide point of view. + */ +int doclusterraz = 0; +#define ISSEQREAD(vp, blk) \ + (((blk) != 0 || doclusterraz) && \ + ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +#else +#define ISSEQREAD(vp, blk) \ + ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +#endif + +/* + * This replaces bread. If this is a bread at the beginning of a file and + * lastr is 0, we assume this is the first read and we'll read up to two + * blocks if they are sequential. After that, we'll do regular read ahead + * in clustered chunks. + * + * There are 4 or 5 cases depending on how you count: + * Desired block is in the cache: + * 1 Not sequential access (0 I/Os). + * 2 Access is sequential, do read-ahead (1 ASYNC). + * Desired block is not in cache: + * 3 Not sequential access (1 SYNC). + * 4 Sequential access, next block is contiguous (1 SYNC). + * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) + * + * There are potentially two buffers that require I/O. + * bp is the block requested. + * rbp is the read-ahead block. + * If either is NULL, then you don't have to do the I/O. + */ +cluster_read(vp, filesize, lblkno, size, cred, bpp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lblkno; + long size; + struct ucred *cred; + struct buf **bpp; +{ + struct buf *bp, *rbp; + daddr_t blkno, ioblkno; + long flags; + int error, num_ra, alreadyincore; + +#ifdef DIAGNOSTIC + if (size == 0) + panic("cluster_read: size = 0"); +#endif + + error = 0; + flags = B_READ; + *bpp = bp = getblk(vp, lblkno, size, 0, 0); + if (bp->b_flags & B_CACHE) { + /* + * Desired block is in cache; do any readahead ASYNC. + * Case 1, 2. 
+ */ + trace(TR_BREADHIT, pack(vp, size), lblkno); + flags |= B_ASYNC; + ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1); + alreadyincore = incore(vp, ioblkno) != NULL; + bp = NULL; + } else { + /* Block wasn't in cache, case 3, 4, 5. */ + trace(TR_BREADMISS, pack(vp, size), lblkno); + bp->b_flags |= B_READ; + ioblkno = lblkno; + alreadyincore = 0; + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + } + /* + * XXX + * Replace 1 with a window size based on some permutation of + * maxcontig and rot_delay. This will let you figure out how + * many blocks you should read-ahead (case 2, 4, 5). + * + * If the access isn't sequential, reset the window to 1. + * Note that a read to the same block is considered sequential. + * This catches the case where the file is being read sequentially, + * but at smaller than the filesystem block size. + */ + rbp = NULL; + if (!ISSEQREAD(vp, lblkno)) { + vp->v_ralen = 0; + vp->v_maxra = lblkno; + } else if ((ioblkno + 1) * size <= filesize && !alreadyincore && + !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) && + blkno != -1) { + /* + * Reading sequentially, and the next block is not in the + * cache. We are going to try reading ahead. + */ + if (num_ra) { + /* + * If our desired readahead block had been read + * in a previous readahead but is no longer in + * core, then we may be reading ahead too far + * or are not using our readahead very rapidly. + * In this case we scale back the window. + */ + if (!alreadyincore && ioblkno <= vp->v_maxra) + vp->v_ralen = max(vp->v_ralen >> 1, 1); + /* + * There are more sequential blocks than our current + * window allows, scale up. Ideally we want to get + * in sync with the filesystem maxcontig value. + */ + else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) + vp->v_ralen = vp->v_ralen ? 
+ min(num_ra, vp->v_ralen << 1) : 1; + + if (num_ra > vp->v_ralen) + num_ra = vp->v_ralen; + } + + if (num_ra) /* case 2, 4 */ + rbp = cluster_rbuild(vp, filesize, + bp, ioblkno, blkno, size, num_ra, flags); + else if (ioblkno == lblkno) { + bp->b_blkno = blkno; + /* Case 5: check how many blocks to read ahead */ + ++ioblkno; + if ((ioblkno + 1) * size > filesize || + incore(vp, ioblkno) || (error = VOP_BMAP(vp, + ioblkno, NULL, &blkno, &num_ra)) || blkno == -1) + goto skip_readahead; + /* + * Adjust readahead as above. + * Don't check alreadyincore, we know it is 0 from + * the previous conditional. + */ + if (num_ra) { + if (ioblkno <= vp->v_maxra) + vp->v_ralen = max(vp->v_ralen >> 1, 1); + else if (num_ra > vp->v_ralen && + lblkno != vp->v_lastr) + vp->v_ralen = vp->v_ralen ? + min(num_ra,vp->v_ralen<<1) : 1; + if (num_ra > vp->v_ralen) + num_ra = vp->v_ralen; + } + flags |= B_ASYNC; + if (num_ra) + rbp = cluster_rbuild(vp, filesize, + NULL, ioblkno, blkno, size, num_ra, flags); + else { + rbp = getblk(vp, ioblkno, size, 0, 0); + rbp->b_flags |= flags; + rbp->b_blkno = blkno; + } + } else { + /* case 2; read ahead single block */ + rbp = getblk(vp, ioblkno, size, 0, 0); + rbp->b_flags |= flags; + rbp->b_blkno = blkno; + } + + if (rbp == bp) /* case 4 */ + rbp = NULL; + else if (rbp) { /* case 2, 5 */ + trace(TR_BREADMISSRA, + pack(vp, (num_ra + 1) * size), ioblkno); + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + } + } + + /* XXX Kirk, do we need to make sure the bp has creds? 
 */
skip_readahead:
	/*
	 * Start the I/O on the requested block (if any).  A buffer that
	 * is already DONE or DELWRI here indicates a bookkeeping error
	 * upstream, hence the panic.
	 */
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	/*
	 * Start the read-ahead I/O; on a prior error (or an already
	 * valid buffer) release the read-ahead buffer instead.
	 */
	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	/* Synchronous case: wait only for the originally requested block. */
	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead. We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * vp       - vnode being read
 * filesize - file size, used to clip the run at end of file
 * bp       - buffer for lbn itself, or NULL for pure read-ahead
 * lbn      - first logical block of the cluster
 * blkno    - disk address corresponding to lbn
 * size     - filesystem block size
 * run      - number of contiguous blocks following lbn
 * flags    - B_* flags to apply to each component buffer
 *
 * Returns the (possibly grown) cluster buffer; I/O is NOT started here.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/* Clip the run so the cluster does not extend past end of file. */
	if (size * (lbn + run + 1) > filesize)
		--run;
	/* Degenerate cluster: fall back to a plain single-block buffer. */
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	/*
	 * The bs_children pointer array is carved out of the same
	 * allocation, immediately after the cluster_save header.
	 */
	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		/*
		 * A component of the cluster is already in core,
		 * terminate the cluster early.
		 */
		if (incore(vp, lbn + i))
			break;
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to. If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			/*
			 * No room in the buffer to add another page,
			 * terminate the cluster early.
			 */
			if (tbp->b_bufsize + size > MAXBSIZE) {
#ifdef DIAGNOSTIC
				if (tbp->b_bufsize != MAXBSIZE)
					panic("cluster_rbuild: too much memory");
#endif
				brelse(tbp);
				break;
			}
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	/*
	 * The cluster may have been terminated early, adjust the cluster
	 * buffer size accordingly.  If no cluster could be formed,
	 * deallocate the cluster save info.
	 */
	if (i <= run) {
		if (i == 1) {
			bp->b_saveaddr = b_save->bs_saveaddr;
			bp->b_flags &= ~B_CALL;
			bp->b_iodone = NULL;
			free(b_save, M_SEGMENT);
		}
		allocbuf(bp, size * i);
	}
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		/* Buffer already valid or dirty: no cluster I/O needed. */
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	/* Grow to cover the whole run and arm the completion callback. */
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	/* Recover the child list stashed by cluster_rbuild/cluster_wbuild. */
	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		/* Synchronous caller is sleeping in biowait(); wake it. */
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.  beginning of cluster - begin cluster
 *	3.  middle of a cluster - add to cluster
 *	4.  end of a cluster - asynchronously write cluster
 *
 * Cluster state is kept per-vnode in v_cstart/v_clen/v_lastw/v_lasta.
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	/* Not logically and physically sequential with the last write? */
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if ((lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 * (*endbp is bp itself; it is not
					 * released here.)
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	/* Skip over blocks not in core (nothing to push) and lbn itself. */
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	/* Only delayed-write buffers belong in a write cluster. */
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	/* bp itself is the first block; len now counts potential children. */
	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 *
		 * NOTE(review): the unparenthesized mix of || and && below
		 * relies on && binding tighter, i.e.
		 * (!incore || (last_bp == NULL && start_lbn == lbn)).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		/* Child is handed to the parent I/O; mark it clean/async. */
		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	/* Cluster terminated early: retry with the remainder of the range. */
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
+ */ +struct cluster_save * +cluster_collectbufs(vp, last_bp) + struct vnode *vp; + struct buf *last_bp; +{ + struct cluster_save *buflist; + daddr_t lbn; + int i, len; + + len = vp->v_lastw - vp->v_cstart + 1; + buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), + M_SEGMENT, M_WAITOK); + buflist->bs_nchildren = 0; + buflist->bs_children = (struct buf **)(buflist + 1); + for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) + (void)bread(vp, lbn, last_bp->b_bcount, NOCRED, + &buflist->bs_children[i]); + buflist->bs_children[i] = last_bp; + buflist->bs_nchildren = i + 1; + return (buflist); +} diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c new file mode 100644 index 000000000000..9b5779767d00 --- /dev/null +++ b/sys/kern/vfs_conf.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_conf.c 8.11 (Berkeley) 5/10/95 + */ + +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/vnode.h> + +/* + * These define the root filesystem, device, and root filesystem type. + */ +struct mount *rootfs; +struct vnode *rootvnode; +int (*mountroot)() = NULL; + +/* + * Set up the initial array of known filesystem types. + */ +extern struct vfsops ufs_vfsops; +extern int ffs_mountroot(); +extern struct vfsops lfs_vfsops; +extern int lfs_mountroot(); +extern struct vfsops mfs_vfsops; +extern int mfs_mountroot(); +extern struct vfsops cd9660_vfsops; +extern int cd9660_mountroot(); +extern struct vfsops msdos_vfsops; +extern struct vfsops adosfs_vfsops; +extern struct vfsops nfs_vfsops; +extern int nfs_mountroot(); +extern struct vfsops afs_vfsops; +extern struct vfsops procfs_vfsops; +extern struct vfsops null_vfsops; +extern struct vfsops union_vfsops; +extern struct vfsops umap_vfsops; +extern struct vfsops portal_vfsops; +extern struct vfsops fdesc_vfsops; +extern struct vfsops kernfs_vfsops; + +/* + * Set up the filesystem operations for vnodes. 
 */
/*
 * Table of compiled-in filesystems, selected by config #defines.
 * Entry order here is unimportant; vfs_init links the entries and
 * computes maxvfsconf from the per-entry type numbers.
 * (Fields presumably follow struct vfsconf in <sys/mount.h>:
 * ops vector, name, type number, refcount, flags, mountroot hook,
 * next pointer -- confirm against the header.)
 */
static struct vfsconf vfsconflist[] = {

	/* Fast Filesystem */
#ifdef FFS
	{ &ufs_vfsops, "ufs", 1, 0, MNT_LOCAL, ffs_mountroot, NULL },
#endif

	/* Log-based Filesystem */
#ifdef LFS
	{ &lfs_vfsops, "lfs", 5, 0, MNT_LOCAL, lfs_mountroot, NULL },
#endif

	/* Memory-based Filesystem */
#ifdef MFS
	{ &mfs_vfsops, "mfs", 3, 0, MNT_LOCAL, mfs_mountroot, NULL },
#endif

	/* ISO9660 (aka CDROM) Filesystem */
#ifdef CD9660
	{ &cd9660_vfsops, "cd9660", 14, 0, MNT_LOCAL, cd9660_mountroot, NULL },
#endif

	/* MSDOS Filesystem */
#ifdef MSDOS
	{ &msdos_vfsops, "msdos", 4, 0, MNT_LOCAL, NULL, NULL },
#endif

	/* AmigaDOS Filesystem */
#ifdef ADOSFS
	{ &adosfs_vfsops, "adosfs", 16, 0, MNT_LOCAL, NULL, NULL },
#endif

	/* Sun-compatible Network Filesystem */
#ifdef NFS
	{ &nfs_vfsops, "nfs", 2, 0, 0, nfs_mountroot, NULL },
#endif

	/* Andrew Filesystem */
#ifdef AFS
	{ &afs_vfsops, "andrewfs", 13, 0, 0, afs_mountroot, NULL },
#endif

	/* /proc Filesystem */
#ifdef PROCFS
	{ &procfs_vfsops, "procfs", 12, 0, 0, NULL, NULL },
#endif

	/* Loopback (Minimal) Filesystem Layer */
#ifdef NULLFS
	{ &null_vfsops, "loopback", 9, 0, 0, NULL, NULL },
#endif

	/* Union (translucent) Filesystem */
#ifdef UNION
	{ &union_vfsops, "union", 15, 0, 0, NULL, NULL },
#endif

	/* User/Group Identifier Remapping Filesystem */
#ifdef UMAPFS
	{ &umap_vfsops, "umap", 10, 0, 0, NULL, NULL },
#endif

	/* Portal Filesystem */
#ifdef PORTAL
	{ &portal_vfsops, "portal", 8, 0, 0, NULL, NULL },
#endif

	/* File Descriptor Filesystem */
#ifdef FDESC
	{ &fdesc_vfsops, "fdesc", 7, 0, 0, NULL, NULL },
#endif

	/* Kernel Information Filesystem */
#ifdef KERNFS
	{ &kernfs_vfsops, "kernfs", 11, 0, 0, NULL, NULL },
#endif

};

/*
 * Initially the size of the list, vfs_init will set maxvfsconf
 * to the highest defined type number.
+ */ +int maxvfsconf = sizeof(vfsconflist) / sizeof (struct vfsconf); +struct vfsconf *vfsconf = vfsconflist; + +/* + * + * vfs_opv_descs enumerates the list of vnode classes, each with it's own + * vnode operation vector. It is consulted at system boot to build operation + * vectors. It is NULL terminated. + * + */ +extern struct vnodeopv_desc ffs_vnodeop_opv_desc; +extern struct vnodeopv_desc ffs_specop_opv_desc; +extern struct vnodeopv_desc ffs_fifoop_opv_desc; +extern struct vnodeopv_desc lfs_vnodeop_opv_desc; +extern struct vnodeopv_desc lfs_specop_opv_desc; +extern struct vnodeopv_desc lfs_fifoop_opv_desc; +extern struct vnodeopv_desc mfs_vnodeop_opv_desc; +extern struct vnodeopv_desc dead_vnodeop_opv_desc; +extern struct vnodeopv_desc fifo_vnodeop_opv_desc; +extern struct vnodeopv_desc spec_vnodeop_opv_desc; +extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc; +extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc; +extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc; +extern struct vnodeopv_desc fdesc_vnodeop_opv_desc; +extern struct vnodeopv_desc portal_vnodeop_opv_desc; +extern struct vnodeopv_desc null_vnodeop_opv_desc; +extern struct vnodeopv_desc umap_vnodeop_opv_desc; +extern struct vnodeopv_desc kernfs_vnodeop_opv_desc; +extern struct vnodeopv_desc procfs_vnodeop_opv_desc; +extern struct vnodeopv_desc cd9660_vnodeop_opv_desc; +extern struct vnodeopv_desc cd9660_specop_opv_desc; +extern struct vnodeopv_desc cd9660_fifoop_opv_desc; +extern struct vnodeopv_desc union_vnodeop_opv_desc; + +struct vnodeopv_desc *vfs_opv_descs[] = { + &ffs_vnodeop_opv_desc, + &ffs_specop_opv_desc, +#ifdef FIFO + &ffs_fifoop_opv_desc, +#endif + &dead_vnodeop_opv_desc, +#ifdef FIFO + &fifo_vnodeop_opv_desc, +#endif + &spec_vnodeop_opv_desc, +#ifdef LFS + &lfs_vnodeop_opv_desc, + &lfs_specop_opv_desc, +#ifdef FIFO + &lfs_fifoop_opv_desc, +#endif +#endif +#ifdef MFS + &mfs_vnodeop_opv_desc, +#endif +#ifdef NFS + &nfsv2_vnodeop_opv_desc, + &spec_nfsv2nodeop_opv_desc, +#ifdef 
FIFO + &fifo_nfsv2nodeop_opv_desc, +#endif +#endif +#ifdef FDESC + &fdesc_vnodeop_opv_desc, +#endif +#ifdef PORTAL + &portal_vnodeop_opv_desc, +#endif +#ifdef NULLFS + &null_vnodeop_opv_desc, +#endif +#ifdef UMAPFS + &umap_vnodeop_opv_desc, +#endif +#ifdef KERNFS + &kernfs_vnodeop_opv_desc, +#endif +#ifdef PROCFS + &procfs_vnodeop_opv_desc, +#endif +#ifdef CD9660 + &cd9660_vnodeop_opv_desc, + &cd9660_specop_opv_desc, +#ifdef FIFO + &cd9660_fifoop_opv_desc, +#endif +#endif +#ifdef UNION + &union_vnodeop_opv_desc, +#endif + NULL +}; diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c new file mode 100644 index 000000000000..b5abe5801af4 --- /dev/null +++ b/sys/kern/vfs_init.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.5 (Berkeley) 5/11/95 + */ + + +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/namei.h> +#include <sys/ucred.h> +#include <sys/buf.h> +#include <sys/errno.h> +#include <sys/malloc.h> + +/* + * Sigh, such primitive tools are these... + */ +#if 0 +#define DODEBUG(A) A +#else +#define DODEBUG(A) +#endif + +extern struct vnodeopv_desc *vfs_opv_descs[]; + /* a list of lists of vnodeops defns */ +extern struct vnodeop_desc *vfs_op_descs[]; + /* and the operations they perform */ +/* + * This code doesn't work if the defn is **vnodop_defns with cc. + * The problem is because of the compiler sometimes putting in an + * extra level of indirection for arrays. It's an interesting + * "feature" of C. + */ +int vfs_opv_numops; + +typedef (*PFI)(); /* the standard Pointer to a Function returning an Int */ + +/* + * A miscellaneous routine. + * A generic "default" routine that just returns an error. + */ +int +vn_default_error() +{ + + return (EOPNOTSUPP); +} + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. 
 *
 * An undocumented feature of this approach to defining operations is that
 * there can be multiple entries in vfs_opv_descs for the same operations
 * vector. This allows third parties to extend the set of operations
 * supported by another layer in a binary compatible way. For example,
 * assume that NFS needed to be modified to support Ficus. NFS has an entry
 * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by
 * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions)
 * listing those new operations Ficus adds to NFS, all without modifying the
 * NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
 * that is a(whole)nother story.) This is a feature.
 */
void
vfs_opv_init()
{
	int i, j, k;
	int (***opv_desc_vector_p)();
	int (**opv_desc_vector)();
	struct vnodeopv_entry_desc *opve_descp;

	/*
	 * Allocate the dynamic vectors and fill them in.
	 */
	for (i=0; vfs_opv_descs[i]; i++) {
		opv_desc_vector_p = vfs_opv_descs[i]->opv_desc_vector_p;
		/*
		 * Allocate and init the vector, if it needs it.
		 * Also handle backwards compatibility.
		 */
		if (*opv_desc_vector_p == NULL) {
			/* XXX - shouldn't be M_VNODE */
			MALLOC(*opv_desc_vector_p, PFI*,
			       vfs_opv_numops*sizeof(PFI), M_VNODE, M_WAITOK);
			bzero (*opv_desc_vector_p, vfs_opv_numops*sizeof(PFI));
			DODEBUG(printf("vector at %x allocated\n",
			    opv_desc_vector_p));
		}
		opv_desc_vector = *opv_desc_vector_p;
		for (j=0; vfs_opv_descs[i]->opv_desc_ops[j].opve_op; j++) {
			opve_descp = &(vfs_opv_descs[i]->opv_desc_ops[j]);

			/*
			 * Sanity check:  is this operation listed
			 * in the list of operations?  We check this
			 * by seeing if its offset is zero.  Since
			 * the default routine should always be listed
			 * first, it should be the only one with a zero
			 * offset.  Any other operation with a zero
			 * offset is probably not listed in
			 * vfs_op_descs, and so is probably an error.
			 *
			 * A panic here means the layer programmer
			 * has committed the all-too common bug
			 * of adding a new operation to the layer's
			 * list of vnode operations but
			 * not adding the operation to the system-wide
			 * list of supported operations.
			 */
			if (opve_descp->opve_op->vdesc_offset == 0 &&
			    opve_descp->opve_op->vdesc_offset !=
				VOFFSET(vop_default)) {
				printf("operation %s not listed in %s.\n",
				    opve_descp->opve_op->vdesc_name,
				    "vfs_op_descs");
				panic ("vfs_opv_init: bad operation");
			}
			/*
			 * Fill in this entry.
			 */
			opv_desc_vector[opve_descp->opve_op->vdesc_offset] =
					opve_descp->opve_impl;
		}
	}
	/*
	 * Finally, go back and replace unfilled routines
	 * with their default.  (Sigh, an O(n^3) algorithm.  I
	 * could make it better, but that'd be work, and n is small.)
	 */
	for (i = 0; vfs_opv_descs[i]; i++) {
		opv_desc_vector = *(vfs_opv_descs[i]->opv_desc_vector_p);
		/*
		 * Force every operations vector to have a default routine.
		 */
		if (opv_desc_vector[VOFFSET(vop_default)]==NULL) {
			panic("vfs_opv_init: operation vector without default routine.");
		}
		for (k = 0; k<vfs_opv_numops; k++)
			if (opv_desc_vector[k] == NULL)
				opv_desc_vector[k] =
					opv_desc_vector[VOFFSET(vop_default)];
	}
}

/*
 * Initialize known vnode operations vectors.
 * Must run before vfs_opv_init(): it zeroes every vector pointer and
 * assigns each operation descriptor its offset in the vectors.
 */
void
vfs_op_init()
{
	int i;

	DODEBUG(printf("Vnode_interface_init.\n"));
	/*
	 * Set all vnode vectors to a well known value.
	 */
	for (i = 0; vfs_opv_descs[i]; i++)
		*(vfs_opv_descs[i]->opv_desc_vector_p) = NULL;
	/*
	 * Figure out how many ops there are by counting the table,
	 * and assign each its offset.
	 */
	for (vfs_opv_numops = 0, i = 0; vfs_op_descs[i]; i++) {
		vfs_op_descs[i]->vdesc_offset = vfs_opv_numops;
		vfs_opv_numops++;
	}
	DODEBUG(printf ("vfs_opv_numops=%d\n", vfs_opv_numops));
}

/*
 * Routines having to do with the management of the vnode table.
+ */ +extern struct vnodeops dead_vnodeops; +extern struct vnodeops spec_vnodeops; +struct vattr va_null; + +/* + * Initialize the vnode structures and initialize each file system type. + */ +vfsinit() +{ + struct vfsconf *vfsp; + int i, maxtypenum; + + /* + * Initialize the vnode table + */ + vntblinit(); + /* + * Initialize the vnode name cache + */ + nchinit(); + /* + * Build vnode operation vectors. + */ + vfs_op_init(); + vfs_opv_init(); /* finish the job */ + /* + * Initialize each file system type. + */ + vattr_null(&va_null); + maxtypenum = 0; + for (vfsp = vfsconf, i = 1; i <= maxvfsconf; i++, vfsp++) { + if (i < maxvfsconf) + vfsp->vfc_next = vfsp + 1; + if (maxtypenum <= vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum + 1; + (*vfsp->vfc_vfsops->vfs_init)(vfsp); + } + /* next vfc_typenum to be used */ + maxvfsconf = maxtypenum; +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c new file mode 100644 index 000000000000..826fbfeab83b --- /dev/null +++ b/sys/kern/vfs_lookup.c @@ -0,0 +1,645 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95 + */ + +#include <sys/param.h> +#include <sys/syslimits.h> +#include <sys/time.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/proc.h> + +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +/* + * Convert a pathname into a pointer to a locked inode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. 
 *
 * Overall outline of namei:
 *
 *	copy in name
 *	get starting directory
 *	while (!done && !error) {
 *		call lookup to search path.
 *		if symbolic link, massage name in buffer and continue
 *	}
 *
 * Returns 0 with the result vnode in ndp->ni_vp (locking per the
 * LOCKLEAF/LOCKPARENT/WANTPARENT flags), or an errno with ni_vp NULL.
 */
int
namei(ndp)
	register struct nameidata *ndp;
{
	register struct filedesc *fdp;	/* pointer to file descriptor state */
	register char *cp;		/* pointer into pathname argument */
	register struct vnode *dp;	/* the directory we are searching */
	struct iovec aiov;		/* uio for reading symbolic links */
	struct uio auio;
	int error, linklen;
	struct componentname *cnp = &ndp->ni_cnd;
	struct proc *p = cnp->cn_proc;

	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_proc->p_ucred;
#ifdef DIAGNOSTIC
	if (!cnp->cn_cred || !cnp->cn_proc)
		panic ("namei: bad cred/proc");
	if (cnp->cn_nameiop & (~OPMASK))
		panic ("namei: nameiop contaminated with flags");
	if (cnp->cn_flags & OPMASK)
		panic ("namei: flags contaminated with nameiops");
#endif
	fdp = cnp->cn_proc->p_fd;

	/*
	 * Get a buffer for the name to be translated, and copy the
	 * name into the buffer.
	 */
	if ((cnp->cn_flags & HASBUF) == 0)
		MALLOC(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
	/* ni_segflg selects a kernel- vs user-space source for the path. */
	if (ndp->ni_segflg == UIO_SYSSPACE)
		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
			    MAXPATHLEN, &ndp->ni_pathlen);
	else
		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
			    MAXPATHLEN, &ndp->ni_pathlen);
	if (error) {
		free(cnp->cn_pnbuf, M_NAMEI);
		ndp->ni_vp = NULL;
		return (error);
	}
	ndp->ni_loopcnt = 0;
#ifdef KTRACE
	if (KTRPOINT(cnp->cn_proc, KTR_NAMEI))
		ktrnamei(cnp->cn_proc->p_tracep, cnp->cn_pnbuf);
#endif

	/*
	 * Get starting point for the translation.
	 */
	if ((ndp->ni_rootdir = fdp->fd_rdir) == NULL)
		ndp->ni_rootdir = rootvnode;
	dp = fdp->fd_cdir;
	VREF(dp);
	for (;;) {
		/*
		 * Check if root directory should replace current directory.
		 * Done at start of translation and after symbolic link.
		 */
		cnp->cn_nameptr = cnp->cn_pnbuf;
		if (*(cnp->cn_nameptr) == '/') {
			vrele(dp);
			while (*(cnp->cn_nameptr) == '/') {
				cnp->cn_nameptr++;
				ndp->ni_pathlen--;
			}
			dp = ndp->ni_rootdir;
			VREF(dp);
		}
		ndp->ni_startdir = dp;
		/* lookup() consumes the ni_startdir reference. */
		if (error = lookup(ndp)) {
			FREE(cnp->cn_pnbuf, M_NAMEI);
			return (error);
		}
		/*
		 * Check for symbolic link
		 */
		if ((cnp->cn_flags & ISSYMLINK) == 0) {
			/* Keep the name buffer only if the caller asked. */
			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
				FREE(cnp->cn_pnbuf, M_NAMEI);
			else
				cnp->cn_flags |= HASBUF;
			return (0);
		}
		if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
			VOP_UNLOCK(ndp->ni_dvp, 0, p);
		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
			error = ELOOP;
			break;
		}
		/*
		 * Read the link into a fresh buffer when there is a
		 * remaining path to splice after it; otherwise reuse
		 * the pathname buffer in place.
		 */
		if (ndp->ni_pathlen > 1)
			MALLOC(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
		else
			cp = cnp->cn_pnbuf;
		aiov.iov_base = cp;
		aiov.iov_len = MAXPATHLEN;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_rw = UIO_READ;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_procp = (struct proc *)0;
		auio.uio_resid = MAXPATHLEN;
		if (error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred)) {
			if (ndp->ni_pathlen > 1)
				free(cp, M_NAMEI);
			break;
		}
		linklen = MAXPATHLEN - auio.uio_resid;
		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
			if (ndp->ni_pathlen > 1)
				free(cp, M_NAMEI);
			error = ENAMETOOLONG;
			break;
		}
		/* Splice link target + unconsumed remainder of the path. */
		if (ndp->ni_pathlen > 1) {
			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
			FREE(cnp->cn_pnbuf, M_NAMEI);
			cnp->cn_pnbuf = cp;
		} else
			cnp->cn_pnbuf[linklen] = '\0';
		ndp->ni_pathlen += linklen;
		vput(ndp->ni_vp);
		dp = ndp->ni_dvp;
	}
	/* Error exit: drop buffer and the references left by lookup(). */
	FREE(cnp->cn_pnbuf, M_NAMEI);
	vrele(ndp->ni_dvp);
	vput(ndp->ni_vp);
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Search a pathname.
 * This is a very central and rather complicated routine.
 *
 * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
 * The starting directory is taken from ni_startdir.
The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. and crossing mount points and on mounted filesys, find parent + * call VOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. 
+ * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + */ +int +lookup(ndp) + register struct nameidata *ndp; +{ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp = 0; /* the directory we are searching */ + struct vnode *tdp; /* saved dp */ + struct mount *mp; /* mount table entry */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; + struct componentname *cnp = &ndp->ni_cnd; + struct proc *p = cnp->cn_proc; + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + ndp->ni_dvp = NULL; + cnp->cn_flags &= ~ISSYMLINK; + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + +dirloop: + /* + * Search a new directory. + * + * The cn_hash value is for use by vfs_cache. + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. 
+ */ + cnp->cn_consume = 0; + cnp->cn_hash = 0; + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + cnp->cn_hash += (unsigned char)*cp; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto bad; + } +#ifdef NAMEI_DIAGNOSTIC + { char c = *cp; + *cp = '\0'; + printf("{%s}: ", cnp->cn_nameptr); + *cp = c; } +#endif + ndp->ni_pathlen -= cnp->cn_namelen; + ndp->ni_next = cp; + cnp->cn_flags |= MAKEENTRY; + if (*cp == '\0' && docache == 0) + cnp->cn_flags &= ~MAKEENTRY; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". + */ + if (cnp->cn_nameptr[0] == '\0') { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto bad; + } + if (wantparent) { + ndp->ni_dvp = dp; + VREF(dp); + } + ndp->ni_vp = dp; + if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) + VOP_UNLOCK(dp, 0, p); + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + /* + * Handle "..": two special cases. + * 1. If at root directory (e.g. after chroot) + * or at absolute root directory + * then ignore it so can't get out. + * 2. If this vnode is the root of a mounted + * filesystem, then replace it with the + * vnode which was mounted on so we take the + * .. in the other file system. 
+ */ + if (cnp->cn_flags & ISDOTDOT) { + for (;;) { + if (dp == ndp->ni_rootdir || dp == rootvnode) { + ndp->ni_dvp = dp; + ndp->ni_vp = dp; + VREF(dp); + goto nextname; + } + if ((dp->v_flag & VROOT) == 0 || + (cnp->cn_flags & NOCROSSMOUNT)) + break; + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + } + } + + /* + * We now have a segment name to search for, and a directory to search. + */ +unionlookup: + ndp->ni_dvp = dp; + ndp->ni_vp = NULL; + if (error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) { +#ifdef DIAGNOSTIC + if (ndp->ni_vp != NULL) + panic("leaf should be empty"); +#endif +#ifdef NAMEI_DIAGNOSTIC + printf("not found\n"); +#endif + if ((error == ENOENT) && + (dp->v_flag & VROOT) && + (dp->v_mount->mnt_flag & MNT_UNION)) { + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + goto unionlookup; + } + + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + return (0); + } +#ifdef NAMEI_DIAGNOSTIC + printf("found\n"); +#endif + + /* + * Take into account any additional components consumed by + * the underlying filesystem. + */ + if (cnp->cn_consume > 0) { + cnp->cn_nameptr += cnp->cn_consume; + ndp->ni_next += cnp->cn_consume; + ndp->ni_pathlen -= cnp->cn_consume; + cnp->cn_consume = 0; + } + + dp = ndp->ni_vp; + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted file system. 
+ */ + while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + goto bad2; + vput(dp); + ndp->ni_vp = dp = tdp; + } + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + return (0); + } + +nextname: + /* + * Not a symbolic link. If more pathname, + * continue at next component, else return. + */ + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + vrele(ndp->ni_dvp); + goto dirloop; + } + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + if (!wantparent) + vrele(ndp->ni_dvp); + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, p); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && *ndp->ni_next == '\0') + VOP_UNLOCK(ndp->ni_dvp, 0, p); + vrele(ndp->ni_dvp); +bad: + vput(dp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * relookup - lookup a path name component + * Used by lookup to re-aquire things. + */ +int +relookup(dvp, vpp, cnp) + struct vnode *dvp, **vpp; + struct componentname *cnp; +{ + struct proc *p = cnp->cn_proc; + struct vnode *dp = 0; /* the directory we are searching */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; +#ifdef NAMEI_DIAGNOSTIC + int newhash; /* DEBUG: check name hash */ + char *cp; /* DEBUG: check name ptr/len */ +#endif + + /* + * Setup: break out flag bits into variables. 
+ */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + +/* dirloop: */ + /* + * Search a new directory. + * + * The cn_hash value is for use by vfs_cache. + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ +#ifdef NAMEI_DIAGNOSTIC + for (newhash = 0, cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + newhash += (unsigned char)*cp; + if (newhash != cnp->cn_hash) + panic("relookup: bad hash"); + if (cnp->cn_namelen != cp - cnp->cn_nameptr) + panic ("relookup: bad len"); + if (*cp != 0) + panic("relookup: not last component"); + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". + */ + if (cnp->cn_nameptr[0] == '\0') { + if (cnp->cn_nameiop != LOOKUP || wantparent) { + error = EISDIR; + goto bad; + } + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp, 0, p); + *vpp = dp; + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ + if (error = VOP_LOOKUP(dp, vpp, cnp)) { +#ifdef DIAGNOSTIC + if (*vpp != NULL) + panic("leaf should be empty"); +#endif + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. 
+ */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + return (0); + } + dp = *vpp; + +#ifdef DIAGNOSTIC + /* + * Check for symbolic link + */ + if (dp->v_type == VLNK && (cnp->cn_flags & FOLLOW)) + panic ("relookup: symlink found.\n"); +#endif + + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if (!wantparent) + vrele(dvp); + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, p); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp, 0, p); + vrele(dvp); +bad: + vput(dp); + *vpp = NULL; + return (error); +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c new file mode 100644 index 000000000000..f891e02d519e --- /dev/null +++ b/sys/kern/vfs_subr.c @@ -0,0 +1,1782 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + */ + +/* + * External virtual filesystem routines + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/namei.h> +#include <sys/ucred.h> +#include <sys/buf.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/domain.h> +#include <sys/mbuf.h> + +#include <vm/vm.h> +#include <sys/sysctl.h> + +#include <miscfs/specfs/specdev.h> + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +/* + * Insq/Remq for the vnode usage lists. + */ +#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) +#define bufremvn(bp) { \ + LIST_REMOVE(bp, b_vnbufs); \ + (bp)->b_vnbufs.le_next = NOLIST; \ +} +TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +struct mntlist mountlist; /* mounted filesystem list */ +struct simplelock mountlist_slock; +static struct simplelock mntid_slock; +struct simplelock mntvnode_slock; +struct simplelock vnode_free_list_slock; +static struct simplelock spechash_slock; + +/* + * Initialize the vnode management data structures. + */ +void +vntblinit() +{ + + simple_lock_init(&mntvnode_slock); + simple_lock_init(&mntid_slock); + simple_lock_init(&spechash_slock); + TAILQ_INIT(&vnode_free_list); + simple_lock_init(&vnode_free_list_slock); + CIRCLEQ_INIT(&mountlist); +} + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. 
+ */ +int +vfs_busy(mp, flags, interlkp, p) + struct mount *mp; + int flags; + struct simplelock *interlkp; + struct proc *p; +{ + int lkflags; + + if (mp->mnt_flag & MNT_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_flag |= MNT_MWAIT; + if (interlkp) + simple_unlock(interlkp); + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. + */ + sleep((caddr_t)mp, PVFS); + if (interlkp) + simple_lock(interlkp); + return (ENOENT); + } + lkflags = LK_SHARED; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, p) + struct mount *mp; + struct proc *p; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. 
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;	/* filesystem type name, matched against vfsconf */
	char *devname;		/* device name recorded in f_mntfromname */
	struct mount **mpp;	/* OUT: new, zeroed, busied mount structure */
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	/* Find the named filesystem type in the registered vfsconf list. */
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	/*
	 * NOTE(review): the return value is ignored; presumably a
	 * LK_NOWAIT shared lock on a freshly initialized mount lock
	 * is always granted -- confirm.
	 */
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;	/* root is mounted read-only at first */
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	/*
	 * NOTE(review): strncpy leaves f_fstypename without a NUL
	 * terminator if vfc_name fills all MFSNAMELEN bytes -- confirm
	 * consumers treat the field as fixed-width.
	 */
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
int
vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*mountroot)(void);	/* preselected root mount routine, if any */
	int error;

	/* A preselected routine short-circuits the search below. */
	if (mountroot != NULL)
		return ((*mountroot)());
	/* Otherwise try every registered type that offers vfc_mountroot. */
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	/* No filesystem could mount the root. */
	return (ENODEV);
}

/*
 * Lookup a mount point by filesystem identifier.
+ */ +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + register struct mount *mp; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; + mp = mp->mnt_list.cqe_next) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + simple_unlock(&mountlist_slock); + return (mp); + } + } + simple_unlock(&mountlist_slock); + return ((struct mount *)0); +} + +/* + * Get a new unique fsid + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ +static u_short xxxfs_mntid; + + fsid_t tfsid; + int mtype; + + simple_lock(&mntid_slock); + mtype = mp->mnt_vfc->vfc_typenum; + mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); + mp->mnt_stat.f_fsid.val[1] = mtype; + if (xxxfs_mntid == 0) + ++xxxfs_mntid; + tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid); + tfsid.val[1] = mtype; + if (mountlist.cqh_first != (void *)&mountlist) { + while (vfs_getvfs(&tfsid)) { + tfsid.val[0]++; + xxxfs_mntid++; + } + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + simple_unlock(&mntid_slock); +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(vap) + register struct vattr *vap; +{ + + vap->va_type = VNON; + vap->va_size = vap->va_bytes = VNOVAL; + vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid = + vap->va_fsid = vap->va_fileid = + vap->va_blocksize = vap->va_rdev = + vap->va_atime.ts_sec = vap->va_atime.ts_nsec = + vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec = + vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec = + vap->va_flags = vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * Routines having to do with the management of the vnode table. + */ +extern int (**dead_vnodeop_p)(); +static void vclean __P((struct vnode *vp, int flag, struct proc *p)); +extern void vgonel __P((struct vnode *vp, struct proc *p)); +long numvnodes; +extern struct vattr va_null; + +/* + * Return the next vnode from the free list. 
+ */ +int +getnewvnode(tag, mp, vops, vpp) + enum vtagtype tag; + struct mount *mp; + int (**vops)(); + struct vnode **vpp; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp; + int s; + int cnt; + +top: + simple_lock(&vnode_free_list_slock); + if ((vnode_free_list.tqh_first == NULL && + numvnodes < 2 * desiredvnodes) || + numvnodes < desiredvnodes) { + simple_unlock(&vnode_free_list_slock); + vp = (struct vnode *)malloc((u_long)sizeof *vp, + M_VNODE, M_WAITOK); + bzero((char *)vp, sizeof *vp); + numvnodes++; + } else { + for (vp = vnode_free_list.tqh_first; + vp != NULLVP; vp = vp->v_freelist.tqe_next) { + if (simple_lock_try(&vp->v_interlock)) + break; + } + /* + * Unless this is a bad time of the month, at most + * the first NCPUS items on the free list are + * locked, so this is close enough to being empty. + */ + if (vp == NULLVP) { + simple_unlock(&vnode_free_list_slock); + tablefull("vnode"); + *vpp = 0; + return (ENFILE); + } + if (vp->v_usecount) + panic("free vnode isn't"); + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + /* see comment on why 0xdeadb is set at end of vgone (below) */ + vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb; + simple_unlock(&vnode_free_list_slock); + vp->v_lease = NULL; + if (vp->v_type != VBAD) + vgonel(vp, p); + else + simple_unlock(&vp->v_interlock); +#ifdef DIAGNOSTIC + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if (vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); +#endif + vp->v_flag = 0; + vp->v_lastr = 0; + vp->v_ralen = 0; + vp->v_maxra = 0; + vp->v_lastw = 0; + vp->v_lasta = 0; + vp->v_cstart = 0; + vp->v_clen = 0; + vp->v_socket = 0; + } + vp->v_type = VNON; + cache_purge(vp); + vp->v_tag = tag; + vp->v_op = vops; + insmntque(vp, mp); + *vpp = vp; + vp->v_usecount = 1; + vp->v_data = 0; + return (0); +} + +/* + * Move a vnode from one mount queue to another. 
 */
void
insmntque(vp, mp)
	struct vnode *vp;	/* vnode to move */
	struct mount *mp;	/* destination mount point; NULL just removes */
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 * (The assignment inside the condition also records the new mount.)
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;	/* buffer whose write has completed */
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if (vp = bp->b_vp) {	/* assignment intended */
		/* One fewer write outstanding on this vnode. */
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		/* Wake anyone in VBWAIT once all output has drained. */
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			if (vp->v_numoutput < 0)
				panic("vwakeup: neg numoutput 2");
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
+ */ +int +vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + + if (flags & V_SAVE) { + if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + return (error); + if (vp->v_dirtyblkhd.lh_first != NULL) + panic("vinvalbuf: dirty bufs"); + } + for (;;) { + if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA) + while (blist && blist->b_lblkno < 0) + blist = blist->b_vnbufs.le_next; + if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && + (flags & V_SAVEMETA)) + while (blist && blist->b_lblkno < 0) + blist = blist->b_vnbufs.le_next; + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if (flags & V_SAVEMETA && bp->b_lblkno < 0) + continue; + s = splbio(); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t)bp, + slpflag | (PRIBIO + 1), "vinvalbuf", + slptimeo); + splx(s); + if (error) + return (error); + break; + } + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + /* + * XXX Since there are no node locks for NFS, I believe + * there is a slight chance that a delayed write will + * occur while sleeping just above, so check for it. + */ + if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { + (void) VOP_BWRITE(bp); + break; + } + bp->b_flags |= B_INVAL; + brelse(bp); + } + } + if (!(flags & V_SAVEMETA) && + (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + if (bp->b_vp) + panic("bgetvp: not free"); + VHOLD(vp); + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + /* + * Insert onto list for new vnode. 
 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;	/* buffer to detach; must have a vnode */
{
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	/* Drop the hold taken by bgetvp(). */
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;	/* buffer to move */
	register struct vnode *newvp;	/* vnode to attach it to */
{
	register struct buflists *listheadp;

	if (newvp == NULL) {
		/* NOTE(review): message lacks a trailing newline. */
		printf("reassignbuf: NULL");
		return;
	}
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI)
		listheadp = &newvp->v_dirtyblkhd;
	else
		listheadp = &newvp->v_cleanblkhd;
	bufinsvn(bp, listheadp);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 * Returns 0 with the vnode in *vpp, or an errno with *vpp = NULLVP.
 */
int
bdevvp(dev, vpp)
	dev_t dev;		/* device number; NODEV is rejected */
	struct vnode **vpp;	/* OUT: vnode for the device */
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENODEV);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	/*
	 * If an aliased vnode for this device already exists, checkalias
	 * returns it and we discard the one just allocated.
	 * (Assignment inside the condition is intended.)
	 */
	if (nvp = checkalias(vp, dev, (struct mount *)0)) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).
If such an alias exists, deallocate + * the existing contents and return the aliased vnode. The + * caller is responsible for filling it with its new contents. + */ +struct vnode * +checkalias(nvp, nvp_rdev, mp) + register struct vnode *nvp; + dev_t nvp_rdev; + struct mount *mp; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp; + struct vnode **vpp; + + if (nvp->v_type != VBLK && nvp->v_type != VCHR) + return (NULLVP); + + vpp = &speclisth[SPECHASH(nvp_rdev)]; +loop: + simple_lock(&spechash_slock); + for (vp = *vpp; vp; vp = vp->v_specnext) { + if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + */ + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + simple_unlock(&spechash_slock); + vgonel(vp, p); + goto loop; + } + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { + simple_unlock(&spechash_slock); + goto loop; + } + break; + } + if (vp == NULL || vp->v_tag != VT_NON) { + MALLOC(nvp->v_specinfo, struct specinfo *, + sizeof(struct specinfo), M_VNODE, M_WAITOK); + nvp->v_rdev = nvp_rdev; + nvp->v_hashchain = vpp; + nvp->v_specnext = *vpp; + nvp->v_specflags = 0; + simple_unlock(&spechash_slock); + *vpp = nvp; + if (vp != NULLVP) { + nvp->v_flag |= VALIASED; + vp->v_flag |= VALIASED; + vput(vp); + } + return (NULLVP); + } + simple_unlock(&spechash_slock); + VOP_UNLOCK(vp, 0, p); + simple_lock(&vp->v_interlock); + vclean(vp, 0, p); + vp->v_op = nvp->v_op; + vp->v_tag = nvp->v_tag; + nvp->v_type = VNON; + insmntque(vp, mp); + return (vp); +} + +/* + * Grab a particular vnode from the free list, increment its + * reference count and lock it. The vnode lock bit is set the + * vnode is being eliminated in vgone. The process is awakened + * when the transition is completed, and an error returned to + * indicate that the vnode is no longer usable (possibly having + * been changed to a new file system type). 
+ */ +int +vget(vp, flags, p) + struct vnode *vp; + int flags; + struct proc *p; +{ + int error; + + /* + * If the vnode is in the process of being cleaned out for + * another use, we wait for the cleaning to finish and then + * return failure. Cleaning is determined by checking that + * the VXLOCK flag is set. + */ + if ((flags & LK_INTERLOCK) == 0) + simple_lock(&vp->v_interlock); + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vget", 0); + return (ENOENT); + } + if (vp->v_usecount == 0) { + simple_lock(&vnode_free_list_slock); + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + } + vp->v_usecount++; + if (flags & LK_TYPE_MASK) { + if (error = vn_lock(vp, flags | LK_INTERLOCK, p)) + vrele(vp); + return (error); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. + * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_nolock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ +#ifdef notyet + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. + * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. 
Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. + */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + if (vp->v_vnlock == NULL) { + if ((flags & LK_TYPE_MASK) == LK_DRAIN) + return (0); + MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock), + M_VNODE, M_WAITOK); + lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); + } + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; + return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p)); +#else /* for now */ + /* + * Since we are not using the lock manager, we must clear + * the interlock here. + */ + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return (0); +#endif +} + +/* + * Decrement the active use count. + */ +int +vop_nounlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) + return (0); + return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p)); +} + +/* + * Return whether or not the node is in use. + */ +int +vop_noislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) + return (0); + return (lockstatus(vp->v_vnlock)); +} + +/* + * Vnode reference. 
+ */ +void +vref(vp) + struct vnode *vp; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount <= 0) + panic("vref used where vget required"); + vp->v_usecount++; + simple_unlock(&vp->v_interlock); +} + +/* + * vput(), just unlock and vrele() + */ +void +vput(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + +#ifdef DIGANOSTIC + if (vp == NULL) + panic("vput: null vp"); +#endif + simple_lock(&vp->v_interlock); + vp->v_usecount--; + if (vp->v_usecount > 0) { + simple_unlock(&vp->v_interlock); + VOP_UNLOCK(vp, 0, p); + return; + } +#ifdef DIAGNOSTIC + if (vp->v_usecount < 0 || vp->v_writecount != 0) { + vprint("vput: bad ref count", vp); + panic("vput: ref cnt"); + } +#endif + /* + * insert at tail of LRU list + */ + simple_lock(&vnode_free_list_slock); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); +} + +/* + * Vnode release. + * If count drops to zero, call inactive routine and return to freelist. + */ +void +vrele(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + +#ifdef DIAGNOSTIC + if (vp == NULL) + panic("vrele: null vp"); +#endif + simple_lock(&vp->v_interlock); + vp->v_usecount--; + if (vp->v_usecount > 0) { + simple_unlock(&vp->v_interlock); + return; + } +#ifdef DIAGNOSTIC + if (vp->v_usecount < 0 || vp->v_writecount != 0) { + vprint("vrele: bad ref count", vp); + panic("vrele: ref cnt"); + } +#endif + /* + * insert at tail of LRU list + */ + simple_lock(&vnode_free_list_slock); + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) + VOP_INACTIVE(vp, p); +} + +#ifdef DIAGNOSTIC +/* + * Page or buffer structure gets a reference. 
+ */ +void +vhold(vp) + register struct vnode *vp; +{ + + simple_lock(&vp->v_interlock); + vp->v_holdcnt++; + simple_unlock(&vp->v_interlock); +} + +/* + * Page or buffer structure frees a reference. + */ +void +holdrele(vp) + register struct vnode *vp; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_holdcnt <= 0) + panic("holdrele: holdcnt"); + vp->v_holdcnt--; + simple_unlock(&vp->v_interlock); +} +#endif /* DIAGNOSTIC */ + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If MNT_NOFORCE is specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If MNT_FORCE is specified, detach any active vnodes + * that are found. + */ +#ifdef DIAGNOSTIC +int busyprt = 0; /* print out busy vnodes */ +struct ctldebug debug1 = { "busyprt", &busyprt }; +#endif + +int +vflush(mp, skipvp, flags) + struct mount *mp; + struct vnode *skipvp; + int flags; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *nvp; + int busy = 0; + + simple_lock(&mntvnode_slock); +loop: + for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { + if (vp->v_mount != mp) + goto loop; + nvp = vp->v_mntvnodes.le_next; + /* + * Skip over a selected vnode. + */ + if (vp == skipvp) + continue; + + simple_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + simple_unlock(&vp->v_interlock); + continue; + } + /* + * If WRITECLOSE is set, only flush out regular file + * vnodes open for writing. + */ + if ((flags & WRITECLOSE) && + (vp->v_writecount == 0 || vp->v_type != VREG)) { + simple_unlock(&vp->v_interlock); + continue; + } + /* + * With v_usecount == 0, all we need to do is clear + * out the vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + simple_unlock(&mntvnode_slock); + vgonel(vp, p); + simple_lock(&mntvnode_slock); + continue; + } + /* + * If FORCECLOSE is set, forcibly close the vnode. 
+ * For block or character devices, revert to an + * anonymous device. For all other files, just kill them. + */ + if (flags & FORCECLOSE) { + simple_unlock(&mntvnode_slock); + if (vp->v_type != VBLK && vp->v_type != VCHR) { + vgonel(vp, p); + } else { + vclean(vp, 0, p); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *)0); + } + simple_lock(&mntvnode_slock); + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + simple_unlock(&vp->v_interlock); + busy++; + } + simple_unlock(&mntvnode_slock); + if (busy) + return (EBUSY); + return (0); +} + +/* + * Disassociate the underlying file system from a vnode. + * The vnode interlock is held on entry. + */ +static void +vclean(vp, flags, p) + struct vnode *vp; + int flags; + struct proc *p; +{ + int active; + + /* + * Check to see if the vnode is in use. + * If so we have to reference it before we clean it out + * so that its count cannot fall to zero and generate a + * race against ourselves to recycle it. + */ + if (active = vp->v_usecount) + vp->v_usecount++; + /* + * Prevent the vnode from being recycled or + * brought into use while we clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); + /* + * Clean out any buffers associated with the vnode. + */ + if (flags & DOCLOSE) + vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. Note that the + * VOP_INACTIVE will unlock the vnode. 
+ */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, IO_NDELAY, NOCRED, p); + VOP_INACTIVE(vp, p); + } else { + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. + */ + VOP_UNLOCK(vp, 0, p); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp, p)) + panic("vclean: cannot reclaim"); + if (active) + vrele(vp); + cache_purge(vp); + if (vp->v_vnlock) { + if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) + vprint("vclean: lock not drained", vp); + FREE(vp->v_vnlock, M_VNODE); + vp->v_vnlock = NULL; + } + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup((caddr_t)vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. + */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + struct proc *p = curproc; /* XXX */ + +#ifdef DIAGNOSTIC + if ((ap->a_flags & REVOKEALL) == 0) + panic("vop_revoke"); +#endif + + vp = ap->a_vp; + simple_lock(&vp->v_interlock); + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + return (0); + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. 
+ */ + vp->v_flag |= VXLOCK; + simple_unlock(&vp->v_interlock); + while (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + simple_unlock(&spechash_slock); + vgone(vq); + break; + } + if (vq == NULLVP) + simple_unlock(&spechash_slock); + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VXLOCK; + } + vgonel(vp, p); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, p) + struct vnode *vp; + struct simplelock *inter_lkp; + struct proc *p; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) + simple_unlock(inter_lkp); + vgonel(vp, p); + return (1); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + simple_lock(&vp->v_interlock); + vgonel(vp, p); +} + +/* + * vgone, with the vp interlock held. + */ +void +vgonel(vp, p) + struct vnode *vp; + struct proc *p; +{ + struct vnode *vq; + struct vnode *vx; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vgone", 0); + return; + } + /* + * Clean out the filesystem specific data. + */ + vclean(vp, DOCLOSE, p); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + insmntque(vp, (struct mount *)0); + /* + * If special device, remove it from special device alias list + * if it is on one. 
+ */ + if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { + simple_lock(&spechash_slock); + if (*vp->v_hashchain == vp) { + *vp->v_hashchain = vp->v_specnext; + } else { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_specnext != vp) + continue; + vq->v_specnext = vp->v_specnext; + break; + } + if (vq == NULL) + panic("missing bdev"); + } + if (vp->v_flag & VALIASED) { + vx = NULL; + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vx) + break; + vx = vq; + } + if (vx == NULL) + panic("missing alias"); + if (vq == NULL) + vx->v_flag &= ~VALIASED; + vp->v_flag &= ~VALIASED; + } + simple_unlock(&spechash_slock); + FREE(vp->v_specinfo, M_VNODE); + vp->v_specinfo = NULL; + } + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the back + * pointer and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + * So, the back pointer is explicitly set to `0xdeadb' in + * getnewvnode after removing it from the freelist to ensure + * that we do not try to move it here. + */ + if (vp->v_usecount == 0) { + simple_lock(&vnode_free_list_slock); + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && + vnode_free_list.tqh_first != vp) { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } + simple_unlock(&vnode_free_list_slock); + } + vp->v_type = VBAD; +} + +/* + * Lookup a vnode by device number. 
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;
	int rc = 0;

	/* Scan the special-device hash chain for (dev, type). */
	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		/*
		 * Found a match; hand it back.  No reference is taken
		 * here -- the caller must vget/vref it if it needs to
		 * hold the vnode beyond this call.
		 */
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	/* Returns 1 if a matching vnode was found, 0 otherwise. */
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	/* Not aliased: the vnode's own use count is the whole story. */
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	/* Sum the use counts of every alias on the hash chain. */
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			/*
			 * vgone() may sleep, so drop the hash lock and
			 * restart the scan from the top afterwards; the
			 * chain may have changed while we slept.
			 */
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
+ */ +static char *typename[] = + { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; + +void +vprint(label, vp) + char *label; + register struct vnode *vp; +{ + char buf[64]; + + if (label != NULL) + printf("%s: ", label); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VALIASED) + strcat(buf, "|VALIASED"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DEBUG +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +void +printlockedvnodes() +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *vp; + + printf("Locked vnodes\n"); + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + if (VOP_ISLOCKED(vp)) + vprint((char *)0, vp); + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +} +#endif + +/* + * Top level filesystem related information gathering. 
+ */ +int +vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + struct ctldebug *cdp; + struct vfsconf *vfsp; + + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, p)); + } + switch (name[1]) { + case VFS_MAXTYPENUM: + return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf)); + case VFS_CONF: + if (namelen < 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp, + sizeof(struct vfsconf))); + } + return (EOPNOTSUPP); +} + +int kinfo_vdebug = 1; +int kinfo_vgetfailed; +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. 
+ */ +/* ARGSUSED */ +int +sysctl_vnode(where, sizep, p) + char *where; + size_t *sizep; + struct proc *p; +{ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + char *bp = where, *savebp; + char *ewhere; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + if (where == NULL) { + *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); + return (0); + } + ewhere = where + *sizep; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + savebp = bp; +again: + simple_lock(&mntvnode_slock); + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + simple_unlock(&mntvnode_slock); + if (kinfo_vdebug) + printf("kinfo: vp changed\n"); + bp = savebp; + goto again; + } + nvp = vp->v_mntvnodes.le_next; + if (bp + VPTRSZ + VNODESZ > ewhere) { + simple_unlock(&mntvnode_slock); + *sizep = bp - where; + return (ENOMEM); + } + simple_unlock(&mntvnode_slock); + if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || + (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) + return (error); + bp += VPTRSZ + VNODESZ; + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + + *sizep = bp - where; + return (0); +} + +/* + * Check to see if a filesystem is mounted on a block device. 
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	/* The device vnode itself has a filesystem mounted on it. */
	if (vp->v_specflags & SI_MOUNTEDON)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		/*
		 * The device has aliases; any alias with SI_MOUNTEDON
		 * set means the underlying device is busy too.
		 */
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specflags & SI_MOUNTEDON) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	/* Returns 0 if free, EBUSY if something is mounted on the device. */
	return (error);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p = curproc;	/* XXX */

	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		/*
		 * Grab the predecessor before dounmount() frees mp and
		 * unlinks it from the mount list.
		 */
		nmp = mp->mnt_list.cqe_prev;
		(void) dounmount(mp, MNT_FORCE, p);
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
+ */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t)np, i); + saddr = (struct sockaddr *)(np + 1); + if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen)) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); + error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not + * used, do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **)&nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh, + np->netc_rnodes); + if (rn == 0) { + /* + * One of the reasons that rnh_addaddr may fail is that + * the entry already exists. To check for this case, we + * look up the entry to see if it is there. If so, we + * do not need to make a new entry but do return success. 
+ */ + free(np, M_NETADDR); + rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); + if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 && + ((struct netcred *)rn)->netc_exflags == argp->ex_flags && + !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon, + (caddr_t)&argp->ex_anon, sizeof(struct ucred))) + return (0); + return (EPERM); + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + caddr_t w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *)w; + + (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); + free((caddr_t)rn, M_NETADDR); + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if (rnh = nep->ne_rtable[i]) { + (*rnh->rnh_walktree)(rnh, vfs_free_netcred, + (caddr_t)rnh); + free((caddr_t)rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (error = vfs_hang_addrlist(mp, nep, argp)) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct mbuf *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. 
+ */ + if (nam != NULL) { + saddr = mtod(nam, struct sockaddr *); + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c new file mode 100644 index 000000000000..0cf7680ec9d4 --- /dev/null +++ b/sys/kern/vfs_syscalls.c @@ -0,0 +1,2417 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/malloc.h> +#include <sys/dirent.h> + +#include <sys/syscallargs.h> + +#include <vm/vm.h> +#include <sys/sysctl.h> + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); +static void checkdirs __P((struct vnode *olddp)); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. 
+ */ +/* ARGSUSED */ +int +mount(p, uap, retval) + struct proc *p; + register struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; + register_t *retval; +{ + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag; + struct vattr va; + u_long fstypenum; + struct nameidata nd; + char fstypename[MFSNAMELEN]; + + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (SCARG(uap, flags) & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((SCARG(uap, flags) & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(vp); + return (EBUSY); + } + VOP_UNLOCK(vp, 0, p); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) || + (va.va_uid != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag)))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } +#ifdef COMPAT_43 + /* + * Historically filesystem types were identified by number. If we + * get an integer for the filesystem type instead of a string, we + * check to see if it matches one of the historic filesystem types. + */ + fstypenum = (u_long)SCARG(uap, type); + if (fstypenum < maxvfsconf) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == fstypenum) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN); + } else +#endif /* COMPAT_43 */ + if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) { + vput(vp); + return (error); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + if (vp->v_mountedhere != NULL) { + vput(vp); + return (EBUSY); + } + + /* + * Allocate and initialize the filesystem. 
+ */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + vp->v_mountedhere = mp; + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = p->p_ucred->cr_uid; +update: + /* + * Set the mount level flags. + */ + if (SCARG(uap, flags) & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_flag |= MNT_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + /* + * Mount the filesystem. + */ + error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_flag & MNT_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_WANTRDWR); + if (error) + mp->mnt_flag = flag; + vfs_unbusy(mp, p); + return (error); + } + /* + * Put the new filesystem on the mount list after root. 
+ */ + cache_purge(vp); + if (!error) { + simple_lock(&mountlist_slock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + checkdirs(vp); + VOP_UNLOCK(vp, 0, p); + vfs_unbusy(mp, p); + if (error = VFS_START(mp, 0, p)) + vrele(vp); + } else { + mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0; + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, p); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory onto which the new filesystem has just been + * mounted. If so, replace them with the new mount point. + */ +static void +checkdirs(olddp) + struct vnode *olddp; +{ + struct filedesc *fdp; + struct vnode *newdp; + struct proc *p; + + if (olddp->v_usecount == 1) + return; + if (VFS_ROOT(olddp->v_mountedhere, &newdp)) + panic("mount: lost mount"); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + fdp = p->p_fd; + if (fdp->fd_cdir == olddp) { + vrele(fdp->fd_cdir); + VREF(newdp); + fdp->fd_cdir = newdp; + } + if (fdp->fd_rdir == olddp) { + vrele(fdp->fd_rdir); + VREF(newdp); + fdp->fd_rdir = newdp; + } + } + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } + vput(newdp); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +/* ARGSUSED */ +int +unmount(p, uap, retval) + struct proc *p; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. 
+ */ + if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + + /* + * Don't allow unmounting the root file system. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), p)); +} + +/* + * Do the actual file system unmount. + */ +int +dounmount(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + struct vnode *coveredvp; + int error; + + simple_lock(&mountlist_slock); + mp->mnt_flag |= MNT_UNMOUNT; + lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); + mp->mnt_flag &=~ MNT_ASYNC; + vnode_pager_umount(mp); /* release cached vnodes */ + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) || + (flags & MNT_FORCE)) + error = VFS_UNMOUNT(mp, flags, p); + simple_lock(&mountlist_slock); + if (error) { + mp->mnt_flag &= ~MNT_UNMOUNT; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE, + &mountlist_slock, p); + return (error); + } + CIRCLEQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { + coveredvp->v_mountedhere = (struct mount *)0; + vrele(coveredvp); + } + mp->mnt_vfc->vfc_refcount--; + if (mp->mnt_vnodelist.lh_first != NULL) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p); + if (mp->mnt_flag & MNT_MWAIT) + wakeup((caddr_t)mp); + free((caddr_t)mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. 
+ */ +#ifdef DEBUG +int syncprt = 0; +struct ctldebug debug0 = { "syncprt", &syncprt }; +#endif + +/* ARGSUSED */ +int +sync(p, uap, retval) + struct proc *p; + void *uap; + register_t *retval; +{ + register struct mount *mp, *nmp; + int asyncflag; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p); + if (asyncflag) + mp->mnt_flag |= MNT_ASYNC; + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ + return (0); +} + +/* + * Change filesystem quotas. + */ +/* ARGSUSED */ +int +quotactl(p, uap, retval) + struct proc *p; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; + register_t *retval; +{ + register struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + vrele(nd.ni_vp); + return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), p)); +} + +/* + * Get filesystem statistics. 
+ */ +/* ARGSUSED */ +int +statfs(p, uap, retval) + struct proc *p; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; + register_t *retval; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + if (error = VFS_STATFS(mp, sp, p)) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +/* ARGSUSED */ +int +fstatfs(p, uap, retval) + struct proc *p; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; + register_t *retval; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + if (error = VFS_STATFS(mp, sp, p)) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. 
+ */ +int +getfsstat(p, uap, retval) + struct proc *p; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT is specified, do not refresh the + * fsstat cache. MNT_WAIT overrides MNT_NOWAIT. + */ + if (((SCARG(uap, flags) & MNT_NOWAIT) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) { + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (error = copyout((caddr_t)sp, sfsp, sizeof(*sp))) + return (error); + sfsp += sizeof(*sp); + } + count++; + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + if (sfsp && count > maxcount) + *retval = maxcount; + else + *retval = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
+ */ +/* ARGSUSED */ +int +fchdir(p, uap, retval) + struct proc *p; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + if (error = getvnode(fdp, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, p); + vrele(fdp->fd_cdir); + fdp->fd_cdir = vp; + return (0); +} + +/* + * Change current working directory (``.''). + */ +/* ARGSUSED */ +int +chdir(p, uap, retval) + struct proc *p; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +/* ARGSUSED */ +int +chroot(p, uap, retval) + struct proc *p; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + if (fdp->fd_rdir != NULL) + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. 
+ */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + if (error = namei(ndp)) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. + */ +int +open(p, uap, retval) + struct proc *p; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register struct vnode *vp; + int flags, cmode; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + extern struct fileops vnops; + + if (error = falloc(p, &nfp, &indx)) + return (error); + fp = nfp; + flags = FFLAGS(SCARG(uap, flags)); + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + p->p_dupfd = -indx - 1; /* XXX check for fdopen */ + if (error = vn_open(&nd, flags, cmode)) { + ffree(fp); + if ((error == ENODEV || error == ENXIO) && + p->p_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) { + *retval = indx; + return (0); + } + if (error == ERESTART) + error = EINTR; + fdp->fd_ofiles[indx] = NULL; + return (error); + } + p->p_dupfd = 0; + vp = nd.ni_vp; + fp->f_flag = flags & FMASK; + fp->f_type = DTYPE_VNODE; + fp->f_ops = &vnops; + fp->f_data = (caddr_t)vp; + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if (error = VOP_ADVLOCK(vp, 
(caddr_t)fp, F_SETLK, &lf, type)) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdp->fd_ofiles[indx] = NULL; + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + VOP_UNLOCK(vp, 0, p); + *retval = indx; + return (0); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +int +compat_43_creat(p, uap, retval) + struct proc *p; + register struct compat_43_creat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(p, &nuap, retval)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. + */ +/* ARGSUSED */ +int +mknod(p, uap, retval) + struct proc *p; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + int whiteout; + struct nameidata nd; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) + error = EEXIST; + else { + VATTR_NULL(&vattr); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (whiteout) { + error = 
VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + if (error) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + } else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp) + vrele(vp); + } + return (error); +} + +/* + * Create a named pipe. + */ +/* ARGSUSED */ +int +mkfifo(p, uap, retval) + struct proc *p; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + int error; + struct nameidata nd; + +#ifndef FIFO + return (EOPNOTSUPP); +#else + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr)); +#endif /* FIFO */ +} + +/* + * Make a hard file link. 
+ */ +/* ARGSUSED */ +int +link(p, uap, retval) + struct proc *p; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR || + (error = suser(p->p_ucred, &p->p_acflag)) == 0) { + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT; + nd.ni_dirp = SCARG(uap, link); + if ((error = namei(&nd)) == 0) { + if (nd.ni_vp != NULL) + error = EEXIST; + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(vp, nd.ni_dvp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + } + } + } + vrele(vp); + return (error); +} + +/* + * Make a symbolic link. 
+ */ +/* ARGSUSED */ +int +symlink(p, uap, retval) + struct proc *p; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + MALLOC(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); +out: + FREE(path, M_NAMEI); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +/* ARGSUSED */ +int +undelete(p, uap, retval) + struct proc *p; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), p); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +/* ARGSUSED */ +int +unlink(p, uap, retval) + struct proc *p; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + + if (vp->v_type != VDIR || + (error = suser(p->p_ucred, &p->p_acflag)) == 0) { + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; + else + (void)vnode_pager_uncache(vp); + } + + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + } + return (error); +} + +/* + * Reposition read/write file offset. 
+ */ +int +lseek(p, uap, retval) + struct proc *p; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; + register_t *retval; +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (SCARG(uap, whence)) { + case L_INCR: + fp->f_offset += SCARG(uap, offset); + break; + case L_XTND: + if (error = + VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p)) + return (error); + fp->f_offset = SCARG(uap, offset) + vattr.va_size; + break; + case L_SET: + fp->f_offset = SCARG(uap, offset); + break; + default: + return (EINVAL); + } + *(off_t *)retval = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +int +compat_43_lseek(p, uap, retval) + struct proc *p; + register struct compat_43_lseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; + register_t *retval; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + off_t qret; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(p, &nuap, &qret); + *(long *)retval = qret; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. 
+ */ +int +access(p, uap, retval) + struct proc *p; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. */ + if (SCARG(uap, flags)) { + flags = 0; + if (SCARG(uap, flags) & R_OK) + flags |= VREAD; + if (SCARG(uap, flags) & W_OK) + flags |= VWRITE; + if (SCARG(uap, flags) & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +/* ARGSUSED */ +int +compat_43_stat(p, uap, retval) + struct proc *p; + register struct compat_43_stat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; + register_t *retval; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. 
+ */ +/* ARGSUSED */ +int +compat_43_lstat(p, uap, retval) + struct proc *p; + register struct compat_43_lstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; + register_t *retval; +{ + struct vnode *vp, *dvp; + struct stat sb, sb1; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + /* + * For symbolic links, always return the attributes of its + * containing directory, except for mode, size, and links. + */ + vp = nd.ni_vp; + dvp = nd.ni_dvp; + if (vp->v_type != VLNK) { + if (dvp == vp) + vrele(dvp); + else + vput(dvp); + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + } else { + error = vn_stat(dvp, &sb, p); + vput(dvp); + if (error) { + vput(vp); + return (error); + } + error = vn_stat(vp, &sb1, p); + vput(vp); + if (error) + return (error); + sb.st_mode &= ~S_IFDIR; + sb.st_mode |= S_IFLNK; + sb.st_nlink = sb1.st_nlink; + sb.st_size = sb1.st_size; + sb.st_blocks = sb1.st_blocks; + } + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */ +/* ARGSUSED */ +int +stat(p, uap, retval) + struct proc *p; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; + register_t *retval; +{ + struct stat sb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +/* ARGSUSED */ +int +lstat(p, uap, retval) + struct proc *p; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; + register_t *retval; +{ + int error; + struct vnode *vp, *dvp; + struct stat sb, sb1; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + /* + * For symbolic links, always return the attributes of its containing + * directory, except for mode, size, inode number, and links. + */ + vp = nd.ni_vp; + dvp = nd.ni_dvp; + if (vp->v_type != VLNK) { + if (dvp == vp) + vrele(dvp); + else + vput(dvp); + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + } else { + error = vn_stat(dvp, &sb, p); + vput(dvp); + if (error) { + vput(vp); + return (error); + } + error = vn_stat(vp, &sb1, p); + vput(vp); + if (error) + return (error); + sb.st_mode &= ~S_IFDIR; + sb.st_mode |= S_IFLNK; + sb.st_nlink = sb1.st_nlink; + sb.st_size = sb1.st_size; + sb.st_blocks = sb1.st_blocks; + sb.st_ino = sb1.st_ino; + } + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get configurable pathname variables. 
+ */ +/* ARGSUSED */ +int +pathconf(p, uap, retval) + struct proc *p; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; + register_t *retval; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. + */ +/* ARGSUSED */ +int +readlink(p, uap, retval) + struct proc *p; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + *retval = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Change flags of a file given a path name. 
+ */ +/* ARGSUSED */ +int +chflags(p, uap, retval) + struct proc *p; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = SCARG(uap, flags); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Change flags of a file given a file descriptor. + */ +/* ARGSUSED */ +int +fchflags(p, uap, retval) + struct proc *p; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = SCARG(uap, flags); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Change mode of a file given path name. 
+ */ +/* ARGSUSED */ +int +chmod(p, uap, retval) + struct proc *p; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = SCARG(uap, mode) & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Change mode of a file given a file descriptor. + */ +/* ARGSUSED */ +int +fchmod(p, uap, retval) + struct proc *p; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = SCARG(uap, mode) & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Set ownership given a path name. 
+ */ +/* ARGSUSED */ +int +chown(p, uap, retval) + struct proc *p; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = SCARG(uap, uid); + vattr.va_gid = SCARG(uap, gid); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Set ownership given a file descriptor. + */ +/* ARGSUSED */ +int +fchown(p, uap, retval) + struct proc *p; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = SCARG(uap, uid); + vattr.va_gid = SCARG(uap, gid); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Set the access and modification times of a file. 
+ */ +/* ARGSUSED */ +int +utimes(p, uap, retval) + struct proc *p; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct timeval tv[2]; + struct vattr vattr; + int error; + struct nameidata nd; + + VATTR_NULL(&vattr); + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + vattr.va_vaflags |= VA_UTIMES_NULL; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + vattr.va_atime.ts_sec = tv[0].tv_sec; + vattr.va_atime.ts_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.ts_sec = tv[1].tv_sec; + vattr.va_mtime.ts_nsec = tv[1].tv_usec * 1000; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Truncate a file given its path name. + */ +/* ARGSUSED */ +int +truncate(p, uap, retval) + struct proc *p; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. 
+ */ +/* ARGSUSED */ +int +ftruncate(p, uap, retval) + struct proc *p; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp, 0, p); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +/* ARGSUSED */ +int +compat_43_truncate(p, uap, retval) + struct proc *p; + register struct compat_43_truncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; + register_t *retval; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(p, &nuap, retval)); +} + +/* + * Truncate a file given a file descriptor. + */ +/* ARGSUSED */ +int +compat_43_ftruncate(p, uap, retval) + struct proc *p; + register struct compat_43_ftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; + register_t *retval; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(p, &nuap, retval)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
+ */ +/* ARGSUSED */ +int +fsync(p, uap, retval) + struct proc *p; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +/* ARGSUSED */ +int +rename(p, uap, retval) + struct proc *p; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; + register_t *retval; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART, + UIO_USERSPACE, SCARG(uap, to), p); + if (error = namei(&tond)) { + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. 
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) + VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (tvp) + VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +/* ARGSUSED */ +int +mkdir(p, uap, retval) + struct proc *p; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (!error) + vput(nd.ni_vp); + return (error); +} + +/* + * Remove a directory file. 
+ */ +/* ARGSUSED */ +int +rmdir(p, uap, retval) + struct proc *p; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. 
+ */ +int +compat_43_getdirentries(p, uap, retval) + struct proc *p; + register struct compat_43_getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + (int *)0, (u_long *)0); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + (int *)0, (u_long *)0); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. 
+ */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + +#ifdef UNION +{ + extern int (**union_vnodeop_p)(); + extern struct vnode *union_dircache __P((struct vnode*, struct proc*)); + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_op == union_vnodeop_p)) { + struct vnode *lvp; + + lvp = union_dircache(vp, p); + if (lvp != NULLVP) { + struct vattr va; + + /* + * If the directory is opaque, + * then don't show lower entries + */ + error = VOP_GETATTR(vp, &va, fp->f_cred, p); + if (va.va_flags & OPAQUE) { + vput(lvp); + lvp = NULL; + } + } + + if (lvp != NULLVP) { + error = VOP_OPEN(lvp, FREAD, fp->f_cred, p); + if (error) { + vput(lvp); + return (error); + } + VOP_UNLOCK(lvp, 0, p); + fp->f_data = (caddr_t) lvp; + fp->f_offset = 0; + error = vn_close(vp, FREAD, fp->f_cred, p); + if (error) + return (error); + vp = lvp; + goto unionread; + } + } +} +#endif /* UNION */ + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + *retval = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a file system independent format. 
+ */ +int +getdirentries(p, uap, retval) + struct proc *p; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + (int *)0, (u_long *)0); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + +#ifdef UNION +{ + extern int (**union_vnodeop_p)(); + extern struct vnode *union_dircache __P((struct vnode*, struct proc*)); + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_op == union_vnodeop_p)) { + struct vnode *lvp; + + lvp = union_dircache(vp, p); + if (lvp != NULLVP) { + struct vattr va; + + /* + * If the directory is opaque, + * then don't show lower entries + */ + error = VOP_GETATTR(vp, &va, fp->f_cred, p); + if (va.va_flags & OPAQUE) { + vput(lvp); + lvp = NULL; + } + } + + if (lvp != NULLVP) { + error = VOP_OPEN(lvp, FREAD, fp->f_cred, p); + if (error) { + vput(lvp); + return (error); + } + VOP_UNLOCK(lvp, 0, p); + fp->f_data = (caddr_t) lvp; + fp->f_offset = 0; + error = vn_close(vp, FREAD, fp->f_cred, p); + if (error) + return (error); + vp = lvp; + goto unionread; + } + } +} +#endif /* UNION */ + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_flag & VROOT) && + 
(vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + *retval = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +int +umask(p, uap, retval) + struct proc *p; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + *retval = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +/* ARGSUSED */ +int +revoke(p, uap, retval) + struct proc *p; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + VOP_REVOKE(vp, REVOKEALL); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */
+int
+getvnode(fdp, fd, fpp)
+	struct filedesc *fdp;
+	struct file **fpp;
+	int fd;
+{
+	struct file *fp;
+
+	/*
+	 * NOTE(review): the K&R identifier list (fdp, fd, fpp) fixes the
+	 * argument order; the declarations above appear in a different
+	 * order, which is legal but easy to misread.
+	 *
+	 * The u_int cast makes a negative descriptor fail the range
+	 * check as well.
+	 */
+	if ((u_int)fd >= fdp->fd_nfiles ||
+	    (fp = fdp->fd_ofiles[fd]) == NULL)
+		return (EBADF);
+	if (fp->f_type != DTYPE_VNODE)
+		return (EINVAL);
+	*fpp = fp;		/* no extra reference is taken for the caller */
+	return (0);
+}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
new file mode 100644
index 000000000000..3cfc6fd7bca3
--- /dev/null
+++ b/sys/kern/vfs_vnops.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/ioctl.h> +#include <sys/tty.h> + +#include <vm/vm.h> + +struct fileops vnops = + { vn_read, vn_write, vn_ioctl, vn_select, vn_closefile }; + +/* + * Common code for vnode open operations. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. 
+ */ +vn_open(ndp, fmode, cmode) + register struct nameidata *ndp; + int fmode, cmode; +{ + register struct vnode *vp; + register struct proc *p = ndp->ni_cnd.cn_proc; + register struct ucred *cred = p->p_ucred; + struct vattr vat; + struct vattr *vap = &vat; + int error; + + if (fmode & O_CREAT) { + ndp->ni_cnd.cn_nameiop = CREATE; + ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if ((fmode & O_EXCL) == 0) + ndp->ni_cnd.cn_flags |= FOLLOW; + if (error = namei(ndp)) + return (error); + if (ndp->ni_vp == NULL) { + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE); + if (error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, + &ndp->ni_cnd, vap)) + return (error); + fmode &= ~O_TRUNC; + vp = ndp->ni_vp; + } else { + VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd); + if (ndp->ni_dvp == ndp->ni_vp) + vrele(ndp->ni_dvp); + else + vput(ndp->ni_dvp); + ndp->ni_dvp = NULL; + vp = ndp->ni_vp; + if (fmode & O_EXCL) { + error = EEXIST; + goto bad; + } + fmode &= ~O_CREAT; + } + } else { + ndp->ni_cnd.cn_nameiop = LOOKUP; + ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF; + if (error = namei(ndp)) + return (error); + vp = ndp->ni_vp; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + if ((fmode & O_CREAT) == 0) { + if (fmode & FREAD) { + if (error = VOP_ACCESS(vp, VREAD, cred, p)) + goto bad; + } + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + if ((error = vn_writechk(vp)) || + (error = VOP_ACCESS(vp, VWRITE, cred, p))) + goto bad; + } + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, p); /* XXX */ + VOP_LEASE(vp, p, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + if (error = VOP_SETATTR(vp, vap, cred, p)) + goto bad; + } + if (error = VOP_OPEN(vp, fmode, cred, p)) + goto bad; + if (fmode & FWRITE) + vp->v_writecount++; + return (0); +bad: + 
vput(vp); + return (error); +} + +/* + * Check for write permissions on the specified vnode. + * Prototype text segments cannot be written. + */ +vn_writechk(vp) + register struct vnode *vp; +{ + + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. + */ + if ((vp->v_flag & VTEXT) && !vnode_pager_uncache(vp)) + return (ETXTBSY); + return (0); +} + +/* + * Vnode close call + */ +vn_close(vp, flags, cred, p) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; +{ + int error; + + if (flags & FWRITE) + vp->v_writecount--; + error = VOP_CLOSE(vp, flags, cred, p); + vrele(vp); + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. + */ +vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct proc *p; +{ + struct uio auio; + struct iovec aiov; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_procp = p; + if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * File table vnode read routine. 
+ */
+vn_read(fp, uio, cred)
+	struct file *fp;
+	struct uio *uio;
+	struct ucred *cred;
+{
+	struct vnode *vp = (struct vnode *)fp->f_data;
+	struct proc *p = uio->uio_procp;
+	int count, error;
+
+	VOP_LEASE(vp, p, cred, LEASE_READ);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+	uio->uio_offset = fp->f_offset;
+	/*
+	 * Snapshot the residual count so the file offset can be
+	 * advanced by the number of bytes actually transferred.
+	 */
+	count = uio->uio_resid;
+	error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
+		cred);
+	fp->f_offset += count - uio->uio_resid;
+	VOP_UNLOCK(vp, 0, p);
+	return (error);
+}
+
+/*
+ * File table vnode write routine.
+ */
+vn_write(fp, uio, cred)
+	struct file *fp;
+	struct uio *uio;
+	struct ucred *cred;
+{
+	struct vnode *vp = (struct vnode *)fp->f_data;
+	struct proc *p = uio->uio_procp;
+	int count, error, ioflag = IO_UNIT;
+
+	/* Build the I/O flags from the open-file flags and mount options. */
+	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
+		ioflag |= IO_APPEND;
+	if (fp->f_flag & FNONBLOCK)
+		ioflag |= IO_NDELAY;
+	if ((fp->f_flag & O_FSYNC) ||
+	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+		ioflag |= IO_SYNC;
+	VOP_LEASE(vp, p, cred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+	uio->uio_offset = fp->f_offset;
+	count = uio->uio_resid;
+	error = VOP_WRITE(vp, uio, ioflag, cred);
+	/*
+	 * For appends, adopt the post-write uio_offset (the filesystem
+	 * positioned the write); otherwise advance by bytes written.
+	 */
+	if (ioflag & IO_APPEND)
+		fp->f_offset = uio->uio_offset;
+	else
+		fp->f_offset += count - uio->uio_resid;
+	VOP_UNLOCK(vp, 0, p);
+	return (error);
+}
+
+/*
+ * File table vnode stat routine. 
+ */ +vn_stat(vp, sb, p) + struct vnode *vp; + register struct stat *sb; + struct proc *p; +{ + struct vattr vattr; + register struct vattr *vap; + int error; + u_short mode; + + vap = &vattr; + error = VOP_GETATTR(vp, vap, p->p_ucred, p); + if (error) + return (error); + /* + * Copy from vattr table + */ + sb->st_dev = vap->va_fsid; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vp->v_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + }; + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + sb->st_size = vap->va_size; + sb->st_atimespec = vap->va_atime; + sb->st_mtimespec = vap->va_mtime; + sb->st_ctimespec = vap->va_ctime; + sb->st_blksize = vap->va_blocksize; + sb->st_flags = vap->va_flags; + sb->st_gen = vap->va_gen; + sb->st_blocks = vap->va_bytes / S_BLKSIZE; + return (0); +} + +/* + * File table vnode ioctl routine. + */ +vn_ioctl(fp, com, data, p) + struct file *fp; + u_long com; + caddr_t data; + struct proc *p; +{ + register struct vnode *vp = ((struct vnode *)fp->f_data); + struct vattr vattr; + int error; + + switch (vp->v_type) { + + case VREG: + case VDIR: + if (com == FIONREAD) { + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + return (error); + *(int *)data = vattr.va_size - fp->f_offset; + return (0); + } + if (com == FIONBIO || com == FIOASYNC) /* XXX */ + return (0); /* XXX */ + /* fall into ... 
*/ + + default: + return (ENOTTY); + + case VFIFO: + case VCHR: + case VBLK: + error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); + if (error == 0 && com == TIOCSCTTY) { + if (p->p_session->s_ttyvp) + vrele(p->p_session->s_ttyvp); + p->p_session->s_ttyvp = vp; + VREF(vp); + } + return (error); + } +} + +/* + * File table vnode select routine. + */ +vn_select(fp, which, p) + struct file *fp; + int which; + struct proc *p; +{ + + return (VOP_SELECT(((struct vnode *)fp->f_data), which, fp->f_flag, + fp->f_cred, p)); +} + +/* + * Check that the vnode is still valid, and if so + * acquire requested lock. + */ +int +vn_lock(vp, flags, p) + struct vnode *vp; + int flags; + struct proc *p; +{ + int error; + + do { + if ((flags & LK_INTERLOCK) == 0) + simple_lock(&vp->v_interlock); + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vn_lock", 0); + error = ENOENT; + } else { + error = VOP_LOCK(vp, flags | LK_INTERLOCK, p); + if (error == 0) + return (error); + } + flags &= ~LK_INTERLOCK; + } while (flags & LK_RETRY); + return (error); +} + +/* + * File table vnode close routine. + */ +vn_closefile(fp, p) + struct file *fp; + struct proc *p; +{ + + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, p)); +} diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh new file mode 100644 index 000000000000..8b74d83eca95 --- /dev/null +++ b/sys/kern/vnode_if.sh @@ -0,0 +1,344 @@ +#!/bin/sh - +copyright=' +/* + * Copyright (c) 1992, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. + * + * from: NetBSD: vnode_if.sh,v 1.7 1994/08/25 03:04:28 cgd Exp $ + */ +' +SCRIPT_ID='@(#)vnode_if.sh 8.7 (Berkeley) 5/11/95' + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# + +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. +src=$1 + +# Names of the created files. +out_c=vnode_if.c +out_h=vnode_if.h + +# Awk program (must support nawk extensions) +# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere. 
+awk=${AWK:-awk} + +# Does this awk have a "toupper" function? (i.e. is it GNU awk) +isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null` + +# If this awk does not define "toupper" then define our own. +if [ "$isgawk" = TRUE ] ; then + # GNU awk provides it. + toupper= +else + # Provide our own toupper() + toupper=' +function toupper(str) { + _toupper_cmd = "echo "str" |tr a-z A-Z" + _toupper_cmd | getline _toupper_str; + close(_toupper_cmd); + return _toupper_str; +}' +fi + +# +# This is the common part of all awk programs that read $src +# This parses the input for one function into the arrays: +# argdir, argtype, argname, willrele +# and calls "doit()" to generate output for the function. +# +# Input to this parser is pre-processed slightly by sed +# so this awk parser doesn't have to work so hard. The +# changes done by the sed pre-processing step are: +# insert a space beween * and pointer name +# replace semicolons with spaces +# +sed_prep='s:\*\([^\*/]\):\* \1:g +s/;/ /' +awk_parser=' +# Comment line +/^#/ { next; } +# First line of description +/^vop_/ { + name=$1; + argc=0; + next; +} +# Last line of description +/^}/ { + doit(); + next; +} +# Middle lines of description +{ + argdir[argc] = $1; i=2; + if ($2 == "WILLRELE") { + willrele[argc] = 1; + i++; + } else + willrele[argc] = 0; + argtype[argc] = $i; i++; + while (i < NF) { + argtype[argc] = argtype[argc]" "$i; + i++; + } + argname[argc] = $i; + argc++; + next; +} +' + +# This is put after the copyright on each generated file. +warning=" +/* + * Warning: This file is generated automatically. + * (Modifications made here may easily be lost!) + * + * Created by the script: + * ${SCRIPT_ID} + */ +" + +# Get rid of ugly spaces +space_elim='s:\([^/]\*\) :\1:g' + +# +# Redirect stdout to the H file. 
+# +echo "$0: Creating $out_h" 1>&2 +exec > $out_h + +# Begin stuff +echo "$copyright" +echo "$warning" +echo ' +extern struct vnodeop_desc vop_default_desc; +' + +# Body stuff +# This awk program needs toupper() so define it if necessary. +sed -e "$sed_prep" $src | $awk "$toupper"' +function doit() { + # Declare arg struct, descriptor. + printf("\nstruct %s_args {\n", name); + printf("\tstruct vnodeop_desc * a_desc;\n"); + for (i=0; i<argc; i++) { + printf("\t%s a_%s;\n", argtype[i], argname[i]); + } + printf("};\n"); + printf("extern struct vnodeop_desc %s_desc;\n", name); + # Define inline function. + printf("#define %s(", toupper(name)); + for (i=0; i<argc; i++) { + printf("%s", argname[i]); + if (i < (argc-1)) printf(", "); + } + printf(") _%s(", toupper(name)); + for (i=0; i<argc; i++) { + printf("%s", argname[i]); + if (i < (argc-1)) printf(", "); + } + printf(")\n"); + printf("static __inline int _%s(", toupper(name)); + for (i=0; i<argc; i++) { + printf("%s", argname[i]); + if (i < (argc-1)) printf(", "); + } + printf(")\n"); + for (i=0; i<argc; i++) { + printf("\t%s %s;\n", argtype[i], argname[i]); + } + printf("{\n\tstruct %s_args a;\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (i=0; i<argc; i++) { + printf("\ta.a_%s = %s;\n", argname[i], argname[i]); + } + printf("\treturn (VCALL(%s%s, VOFFSET(%s), &a));\n}\n", + argname[0], arg0special, name); +} +BEGIN { + arg0special=""; +} +END { + printf("\n/* Special cases: */\n#include <sys/buf.h>\n"); + argc=1; + argtype[0]="struct buf *"; + argname[0]="bp"; + arg0special="->b_vp"; + name="vop_strategy"; + doit(); + name="vop_bwrite"; + doit(); +} +'"$awk_parser" | sed -e "$space_elim" + +# End stuff +echo ' +/* End of special cases. */' + + +# +# Redirect stdout to the C file. 
+# +echo "$0: Creating $out_c" 1>&2 +exec > $out_c + +# Begin stuff +echo "$copyright" +echo "$warning" +echo ' +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/vnode.h> + +struct vnodeop_desc vop_default_desc = { + 0, + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +' + +# Body stuff +sed -e "$sed_prep" $src | $awk ' +function do_offset(typematch) { + for (i=0; i<argc; i++) { + if (argtype[i] == typematch) { + printf("\tVOPARG_OFFSETOF(struct %s_args, a_%s),\n", + name, argname[i]); + return i; + }; + }; + print "\tVDESC_NO_OFFSET,"; + return -1; +} + +function doit() { + # Define offsets array + printf("\nint %s_vp_offsets[] = {\n", name); + for (i=0; i<argc; i++) { + if (argtype[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, argname[i]); + } + } + print "\tVDESC_NO_OFFSET"; + print "};"; + # Define F_desc + printf("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + printf("\t0"); + vpnum = 0; + for (i=0; i<argc; i++) { + if (willrele[i]) { + if (argdir[i] ~ /OUT/) { + printf(" | VDESC_VPP_WILLRELE"); + } else { + printf(" | VDESC_VP%s_WILLRELE", vpnum); + }; + vpnum++; + } + } + print ","; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + do_offset("struct vnode **"); + # cred (if any) + do_offset("struct ucred *"); + # proc (if any) + do_offset("struct proc *"); + # componentname + do_offset("struct componentname *"); + # transport layer information + printf ("\tNULL,\n};\n"); +} +END { + printf("\n/* Special cases: */\n"); + argc=1; + argdir[0]="IN"; + argtype[0]="struct buf *"; + argname[0]="bp"; + willrele[0]=0; + name="vop_strategy"; + doit(); + name="vop_bwrite"; + doit(); +} +'"$awk_parser" | sed -e "$space_elim" + +# End stuff +echo ' +/* End of special cases. */' + +# Add the vfs_op_descs array to the C file. 
+# Begin stuff +echo ' +struct vnodeop_desc *vfs_op_descs[] = { + &vop_default_desc, /* MUST BE FIRST */ + &vop_strategy_desc, /* XXX: SPECIAL CASE */ + &vop_bwrite_desc, /* XXX: SPECIAL CASE */ +' + +# Body stuff +sed -e "$sed_prep" $src | $awk ' +function doit() { + printf("\t&%s_desc,\n", name); +} +'"$awk_parser" + +# End stuff +echo ' NULL +}; +' + +exit 0 + +# Local Variables: +# tab-width: 4 +# End: diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src new file mode 100644 index 000000000000..1e32f29abd58 --- /dev/null +++ b/sys/kern/vnode_if.src @@ -0,0 +1,494 @@ +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95
+#
+
+#
+# Above each of the vop descriptors is a specification of the locking
+# protocol used by each vop call. The first column is the name of
+# the variable, the remaining three columns are in, out and error
+# respectively. The "in" column defines the lock state on input,
+# the "out" column defines the state on successful return, and the
+# "error" column defines the locking state on error exit.
+#
+# The locking value can take the following values:
+# L: locked.
+# U: unlocked.
+# -: not applicable. vnode does not yet exist (or no longer exists).
+# =: the same on input and output, may be either L or U.
+# X: locked if not nil.
+#
+
+#
+#% lookup dvp L ? ?
+#% lookup vpp - L -
+#
+# XXX - the lookup locking protocol defies simple description and depends
+# on the flags and operation fields in the (cnp) structure. Note
+# especially that *vpp may equal dvp and both may be locked.
+# +vop_lookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% create dvp L U U +#% create vpp - L - +# +vop_create { + IN WILLRELE struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% whiteout dvp L L L +#% whiteout cnp - - - +#% whiteout flag - - - +# +vop_whiteout { + IN WILLRELE struct vnode *dvp; + IN struct componentname *cnp; + IN int flags; +}; + +# +#% mknod dvp L U U +#% mknod vpp - X - +# +vop_mknod { + IN WILLRELE struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% open vp L L L +# +vop_open { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% close vp U U U +# +vop_close { + IN struct vnode *vp; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% access vp L L L +# +vop_access { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% getattr vp = = = +# +vop_getattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% setattr vp L L L +# +vop_setattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% read vp L L L +# +vop_read { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% write vp L L L +# +vop_write { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% lease vp = = = +# +vop_lease { + IN struct vnode *vp; + IN struct proc *p; + IN struct ucred *cred; + IN int flag; +}; + +# +#% ioctl vp U U U +# +vop_ioctl { + IN struct vnode *vp; + IN u_long command; + IN caddr_t data; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% select vp U U U +# +# Needs work? 
(fflags) +# +vop_select { + IN struct vnode *vp; + IN int which; + IN int fflags; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% revoke vp U U U +# +vop_revoke { + IN struct vnode *vp; + IN int flags; +}; + +# +# XXX - not used +# +vop_mmap { + IN struct vnode *vp; + IN int fflags; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% fsync vp L L L +# +vop_fsync { + IN struct vnode *vp; + IN struct ucred *cred; + IN int waitfor; + IN struct proc *p; +}; + +# +# XXX - not used +# Needs work: Is newoff right? What's it mean? +# +vop_seek { + IN struct vnode *vp; + IN off_t oldoff; + IN off_t newoff; + IN struct ucred *cred; +}; + +# +#% remove dvp L U U +#% remove vp L U U +# +vop_remove { + IN WILLRELE struct vnode *dvp; + IN WILLRELE struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% link vp U U U +#% link tdvp L U U +# +vop_link { + IN WILLRELE struct vnode *vp; + IN struct vnode *tdvp; + IN struct componentname *cnp; +}; + +# +#% rename fdvp U U U +#% rename fvp U U U +#% rename tdvp L U U +#% rename tvp X U U +# +vop_rename { + IN WILLRELE struct vnode *fdvp; + IN WILLRELE struct vnode *fvp; + IN struct componentname *fcnp; + IN WILLRELE struct vnode *tdvp; + IN WILLRELE struct vnode *tvp; + IN struct componentname *tcnp; +}; + +# +#% mkdir dvp L U U +#% mkdir vpp - L - +# +vop_mkdir { + IN WILLRELE struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% rmdir dvp L U U +#% rmdir vp L U U +# +vop_rmdir { + IN WILLRELE struct vnode *dvp; + IN WILLRELE struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% symlink dvp L U U +#% symlink vpp - U - +# +# XXX - note that the return vnode has already been VRELE'ed +# by the filesystem layer. To use it you must use vget, +# possibly with a further namei. 
+# +vop_symlink { + IN WILLRELE struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; + IN char *target; +}; + +# +#% readdir vp L L L +# +vop_readdir { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; + INOUT int *eofflag; + OUT int *ncookies; + INOUT u_long **cookies; +}; + +# +#% readlink vp L L L +# +vop_readlink { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; +}; + +# +#% abortop dvp = = = +# +vop_abortop { + IN struct vnode *dvp; + IN struct componentname *cnp; +}; + +# +#% inactive vp L U U +# +vop_inactive { + IN struct vnode *vp; + IN struct proc *p; +}; + +# +#% reclaim vp U U U +# +vop_reclaim { + IN struct vnode *vp; + IN struct proc *p; +}; + +# +#% lock vp U L U +# +vop_lock { + IN struct vnode *vp; + IN int flags; + IN struct proc *p; +}; + +# +#% unlock vp L U L +# +vop_unlock { + IN struct vnode *vp; + IN int flags; + IN struct proc *p; +}; + +# +#% bmap vp L L L +#% bmap vpp - U - +# +vop_bmap { + IN struct vnode *vp; + IN daddr_t bn; + OUT struct vnode **vpp; + IN daddr_t *bnp; + OUT int *runp; +}; + +# +# Needs work: no vp? 
+# +#vop_strategy { +# IN struct buf *bp; +#}; + +# +#% print vp = = = +# +vop_print { + IN struct vnode *vp; +}; + +# +#% islocked vp = = = +# +vop_islocked { + IN struct vnode *vp; +}; + +# +#% pathconf vp L L L +# +vop_pathconf { + IN struct vnode *vp; + IN int name; + OUT register_t *retval; +}; + +# +#% advlock vp U U U +# +vop_advlock { + IN struct vnode *vp; + IN caddr_t id; + IN int op; + IN struct flock *fl; + IN int flags; +}; + +# +#% blkatoff vp L L L +# +vop_blkatoff { + IN struct vnode *vp; + IN off_t offset; + OUT char **res; + OUT struct buf **bpp; +}; + +# +#% valloc pvp L L L +# +vop_valloc { + IN struct vnode *pvp; + IN int mode; + IN struct ucred *cred; + OUT struct vnode **vpp; +}; + +# +#% reallocblks vp L L L +# +vop_reallocblks { + IN struct vnode *vp; + IN struct cluster_save *buflist; +}; + +# +#% vfree pvp L L L +# +vop_vfree { + IN struct vnode *pvp; + IN ino_t ino; + IN int mode; +}; + +# +#% truncate vp L L L +# +vop_truncate { + IN struct vnode *vp; + IN off_t length; + IN int flags; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% update vp L L L +# +vop_update { + IN struct vnode *vp; + IN struct timeval *access; + IN struct timeval *modify; + IN int waitfor; +}; + +# +# Needs work: no vp? +# +#vop_bwrite { +# IN struct buf *bp; +#}; |
