/* One entry of the descriptor array handed to poll(). */
struct pollfd {
	int fd;		/* the descriptor this entry refers to */
	short events;	/* events the calling process is interested in for this descriptor */
	short revents;	/* events reported back to the caller */
};
/*
 * poll() system call entry point.
 *
 * Converts the millisecond timeout supplied by the caller into jiffies and
 * hands the real work to do_sys_poll().
 *
 * @ufds:          user-space array of pollfd entries
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: > 0 is a timeout in milliseconds; 0 means do not wait;
 *                 < 0 means wait indefinitely
 *
 * Returns whatever do_sys_poll() returns.
 */
asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
			 long timeout_msecs)
{
	s64 timeout_jiffies;

	if (timeout_msecs <= 0) {
		/* Infinite (< 0) or no (0) timeout: passed through unchanged. */
		timeout_jiffies = timeout_msecs;
	} else {
#if HZ > 1000
		/* The conversion can only overflow if HZ > 1000. */
		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
			timeout_jiffies = -1;
		else
#endif
			timeout_jiffies = msecs_to_jiffies(timeout_msecs);
	}

	/* The function that does the actual work. */
	return do_sys_poll(ufds, nfds, &timeout_jiffies);
}
/*
 * One chunk of pollfd entries copied in from user space.  Chunks are chained
 * through @next; @len is the number of valid elements in @entries.
 *
 * @entries is a C99 flexible array member: the storage for the elements is
 * allocated together with the header, i.e. the allocation size is
 * sizeof(struct poll_list) + len * sizeof(struct pollfd).
 */
struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[];
};
/*
 * Core of poll(): copy the pollfd array in from user space, run do_poll()
 * over it, and copy the resulting revents fields back out.
 *
 * @ufds:    user-space array of pollfd entries
 * @nfds:    number of entries in @ufds
 * @timeout: timeout in jiffies, passed through to do_poll()
 *
 * Returns the value produced by do_poll() on success, -EINTR when that value
 * is 0 and a signal is pending, -EINVAL when @nfds exceeds the RLIMIT_NOFILE
 * soft limit, -ENOMEM when a chunk cannot be allocated, or -EFAULT when a
 * user-space copy fails.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
{
	/*
	 * Allocate small arguments on the stack to save memory and be faster -
	 * use long to make sure the buffer is aligned properly on 64 bit archs
	 * to avoid unaligned access.
	 */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *stack_pp = NULL;
	struct poll_list *head = NULL;
	struct poll_list *tail = NULL;
	struct poll_list *cur, *next;
	struct pollfd __user *uout = ufds;
	struct poll_wqueues table;
	unsigned int remaining;
	int fdcount;
	int err = -ENOMEM;

	/* Sanity check: is the descriptor count above the system limit? */
	if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
		return -EINVAL;

	/* Initialisation; mainly sets up the poll_table function pointer. */
	poll_initwait(&table);

	/*
	 * Copy the pollfd entries from user space into the kernel.  There may
	 * be more entries than fit in one page, so they are copied chunk by
	 * chunk and the chunks are linked into a list.
	 */
	for (remaining = nfds; remaining != 0; remaining -= tail->len) {
		struct poll_list *chunk;
		int num, size;

		/* The first chunk lives on the stack; later ones are kmalloc'ed. */
		num = stack_pp ? POLLFD_PER_PAGE : N_STACK_PPS;
		if (num > remaining)
			num = remaining;
		size = sizeof(struct poll_list) + sizeof(struct pollfd)*num;

		if (stack_pp) {
			chunk = kmalloc(size, GFP_KERNEL);
			if (!chunk)
				goto out_fds;
		} else {
			chunk = (struct poll_list *)stack_pps;
			stack_pp = chunk;
		}

		chunk->next = NULL;
		chunk->len = num;
		if (head)
			tail->next = chunk;
		else
			head = chunk;
		tail = chunk;

		/* nfds - remaining entries have been copied so far. */
		if (copy_from_user(chunk->entries, ufds + nfds-remaining,
				   sizeof(struct pollfd)*num)) {
			err = -EFAULT;
			goto out_fds;
		}
	}

	/* The actual poll operation; results are left in the list at head. */
	fdcount = do_poll(nfds, head, &table, timeout);

	/* OK, now copy the revents fields back to user space. */
	err = -EFAULT;
	for (cur = head; cur != NULL; cur = cur->next) {
		struct pollfd *fds = cur->entries;
		int j;

		for (j = 0; j < cur->len; j++, uout++) {
			if (__put_user(fds[j].revents, &uout->revents))
				goto out_fds;
		}
	}

	if (fdcount == 0 && signal_pending(current))
		err = -EINTR;
	else
		err = fdcount;

	/* Release every chunk except the one that lives on the stack. */
out_fds:
	for (cur = head; cur != NULL; cur = next) {
		next = cur->next;
		if (cur != stack_pp)
			kfree(cur);
	}
	poll_freewait(&table);
	return err;
}
//这个函数就是将当前进程加入等待队列,这个等待队列由驱动或文件系统或网络协议栈来提供 //这个函数是由驱动的file->poll中调用poll_wait()来间接调用的。 /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_table_entry *entry = poll_get_entry(p); if (!entry) return; get_file(filp); entry->filp = filp; entry->wait_address = wait_address; init_waitqueue_entry(&entry->wait, current); add_wait_queue(wait_address,&entry->wait); }
void poll_initwait(struct poll_wqueues *pwq) { //在poll()中初始化为__pollwait(),注意在epoll中又会不同 init_poll_funcptr(&pwq->pt, __pollwait); pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; }
===========================================
/*
 * Poll every descriptor in @list, sleeping between passes until at least one
 * descriptor reports an event, the timeout runs out, a signal is pending, or
 * wait->error is set.
 *
 * @nfds:    number of descriptors (not used in this function's body)
 * @list:    chain of poll_list chunks holding the pollfd entries
 * @wait:    wait-queue bookkeeping; its poll_table is handed to do_pollfd()
 * @timeout: remaining timeout in jiffies; 0 means do not wait, negative means
 *           wait indefinitely.  Updated in place around each sleep.
 *
 * Returns the number of entries for which do_pollfd() returned non-zero, or
 * wait->error when no entry was ready and that field is non-zero.
 */
static int do_poll(unsigned int nfds, struct poll_list *list,
		   struct poll_wqueues *wait, s64 *timeout)
{
	int count = 0;
	poll_table* pt = &wait->pt;

	/* Optimise the no-wait case */
	if (!(*timeout))	/* zero timeout: the caller does not want to wait */
		pt = NULL;

	for (;;) {
		struct poll_list *walk;
		long __timeout;

		/* Again a nested loop: handle the events of every descriptor. */
		set_current_state(TASK_INTERRUPTIBLE);
		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
				 * and kill the poll_table, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				if (do_pollfd(pfd, pt)) {	/* handle one descriptor */
					count++;
					pt = NULL;
				}
			}
		}

		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table to them on the next loop iteration.
		 */
		pt = NULL;
		if (count || !*timeout || signal_pending(current))
			break;
		count = wait->error;
		if (count)
			break;

		/* Timeout handling. */
		if (*timeout < 0) {
			/* Wait indefinitely */
			__timeout = MAX_SCHEDULE_TIMEOUT;
		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
			/*
			 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
			 * a loop
			 */
			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
			*timeout -= __timeout;
		} else {
			__timeout = *timeout;
			*timeout = 0;
		}

		/* Give up the CPU; execution resumes here once woken up. */
		__timeout = schedule_timeout(__timeout);
		if (*timeout >= 0)
			*timeout += __timeout;
	}
	__set_current_state(TASK_RUNNING);
	return count;
}
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if non-NULL.
 *
 * A negative fd yields 0; an fd for which fget_light() returns NULL yields
 * POLLNVAL.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
	unsigned int revents = 0;
	int fd = pollfd->fd;

	if (fd >= 0) {
		int fput_needed;
		struct file *file = fget_light(fd, &fput_needed);

		if (file == NULL) {
			revents = POLLNVAL;
		} else {
			/*
			 * Call the poll handler of the driver or filesystem.
			 * Whether the current process gets added to the
			 * driver's wait queue depends on whether the second
			 * argument (pwait) is NULL.
			 */
			if (file->f_op && file->f_op->poll)
				revents = file->f_op->poll(file, pwait);
			else
				revents = DEFAULT_POLLMASK;

			/* Mask out unneeded events. */
			revents &= pollfd->events | POLLERR | POLLHUP;
			fput_light(file, fput_needed);
		}
	}

	/* Store the result in the entry as well as returning it. */
	pollfd->revents = revents;

	/* Non-zero when a requested event (or POLLERR/POLLHUP/POLLNVAL) is set. */
	return revents;
}
================================= 驱动或文件系统的poll()实现原型: test_poll(struct file *filep, poll_table *wait) { ... poll_wait(filep, &dev->wait_queue_head, wait); ...
if (dev->readable) mask |= POLLIN | POLLRDNORM;
if (dev->writable) mask |= POLLOUT | POLLWRNORM;
...
}
/*
 * Invoke the poll_table's queueing callback for @wait_address.  Does nothing
 * when either @p or @wait_address is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !wait_address)
		return;

	/* For poll(), qproc is the __pollwait() installed by poll_initwait(). */
	p->qproc(filp, wait_address, p);
}
|