2011-11-12 07:55:49 +08:00
|
|
|
#
|
|
|
|
# 64-bit system call numbers and entry vectors
|
|
|
|
#
|
|
|
|
# The format is:
|
|
|
|
# <number> <abi> <name> <entry point>
|
|
|
|
#
|
2012-02-15 06:18:50 +08:00
|
|
|
# The abi is "common", "64" or "x32" for this file.
|
2011-11-12 07:55:49 +08:00
|
|
|
#
|
2012-02-15 06:18:50 +08:00
|
|
|
0 common read sys_read
|
|
|
|
1 common write sys_write
|
|
|
|
2 common open sys_open
|
|
|
|
3 common close sys_close
|
|
|
|
4 common stat sys_newstat
|
|
|
|
5 common fstat sys_newfstat
|
|
|
|
6 common lstat sys_newlstat
|
|
|
|
7 common poll sys_poll
|
|
|
|
8 common lseek sys_lseek
|
|
|
|
9 common mmap sys_mmap
|
|
|
|
10 common mprotect sys_mprotect
|
|
|
|
11 common munmap sys_munmap
|
|
|
|
12 common brk sys_brk
|
2011-11-12 07:55:49 +08:00
|
|
|
13 64 rt_sigaction sys_rt_sigaction
|
2012-02-15 06:18:50 +08:00
|
|
|
14 common rt_sigprocmask sys_rt_sigprocmask
|
2016-01-29 07:11:26 +08:00
|
|
|
15 64 rt_sigreturn sys_rt_sigreturn/ptregs
|
2011-11-12 07:55:49 +08:00
|
|
|
16 64 ioctl sys_ioctl
|
2012-02-15 06:18:50 +08:00
|
|
|
17 common pread64 sys_pread64
|
|
|
|
18 common pwrite64 sys_pwrite64
|
2011-11-12 07:55:49 +08:00
|
|
|
19 64 readv sys_readv
|
|
|
|
20 64 writev sys_writev
|
2012-02-15 06:18:50 +08:00
|
|
|
21 common access sys_access
|
|
|
|
22 common pipe sys_pipe
|
|
|
|
23 common select sys_select
|
|
|
|
24 common sched_yield sys_sched_yield
|
|
|
|
25 common mremap sys_mremap
|
|
|
|
26 common msync sys_msync
|
|
|
|
27 common mincore sys_mincore
|
|
|
|
28 common madvise sys_madvise
|
|
|
|
29 common shmget sys_shmget
|
|
|
|
30 common shmat sys_shmat
|
|
|
|
31 common shmctl sys_shmctl
|
|
|
|
32 common dup sys_dup
|
|
|
|
33 common dup2 sys_dup2
|
|
|
|
34 common pause sys_pause
|
|
|
|
35 common nanosleep sys_nanosleep
|
|
|
|
36 common getitimer sys_getitimer
|
|
|
|
37 common alarm sys_alarm
|
|
|
|
38 common setitimer sys_setitimer
|
|
|
|
39 common getpid sys_getpid
|
|
|
|
40 common sendfile sys_sendfile64
|
|
|
|
41 common socket sys_socket
|
|
|
|
42 common connect sys_connect
|
|
|
|
43 common accept sys_accept
|
|
|
|
44 common sendto sys_sendto
|
2011-11-12 07:55:49 +08:00
|
|
|
45 64 recvfrom sys_recvfrom
|
|
|
|
46 64 sendmsg sys_sendmsg
|
|
|
|
47 64 recvmsg sys_recvmsg
|
2012-02-15 06:18:50 +08:00
|
|
|
48 common shutdown sys_shutdown
|
|
|
|
49 common bind sys_bind
|
|
|
|
50 common listen sys_listen
|
|
|
|
51 common getsockname sys_getsockname
|
|
|
|
52 common getpeername sys_getpeername
|
|
|
|
53 common socketpair sys_socketpair
|
x32: Use compat shims for {g,s}etsockopt
Some of the arguments to {g,s}etsockopt are passed in userland pointers.
If we try to use the 64bit entry point, we end up sometimes failing.
For example, dhcpcd doesn't run in x32:
# dhcpcd eth0
dhcpcd[1979]: version 5.5.6 starting
dhcpcd[1979]: eth0: broadcasting for a lease
dhcpcd[1979]: eth0: open_socket: Invalid argument
dhcpcd[1979]: eth0: send_raw_packet: Bad file descriptor
The code in particular is getting back EINVAL when doing:
struct sock_fprog pf;
setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &pf, sizeof(pf));
Diving into the kernel code, we can see:
include/linux/filter.h:
struct sock_fprog {
unsigned short len;
struct sock_filter __user *filter;
};
net/core/sock.c:
case SO_ATTACH_FILTER:
ret = -EINVAL;
if (optlen == sizeof(struct sock_fprog)) {
struct sock_fprog fprog;
ret = -EFAULT;
if (copy_from_user(&fprog, optval, sizeof(fprog)))
break;
ret = sk_attach_filter(&fprog, sk);
}
break;
arch/x86/syscalls/syscall_64.tbl:
54 common setsockopt sys_setsockopt
55 common getsockopt sys_getsockopt
So for x64, sizeof(sock_fprog) is 16 bytes. For x86/x32, it's 8 bytes.
This comes down to the pointer being 32bit for x32, which means we need
to do structure size translation. But since x32 comes in directly to
sys_setsockopt, it doesn't get translated like x86.
After changing the syscall table and rebuilding glibc with the new kernel
headers, dhcp runs fine in an x32 userland.
Oddly, it seems like Linus noted the same thing during the initial port,
but I guess that was missed/lost along the way:
https://lkml.org/lkml/2011/8/26/452
[ hpa: tagging for -stable since this is an ABI fix. ]
Bugzilla: https://bugs.gentoo.org/423649
Reported-by: Mads <mads@ab3.no>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Link: http://lkml.kernel.org/r/1345320697-15713-1-git-send-email-vapier@gentoo.org
Cc: H. J. Lu <hjl.tools@gmail.com>
Cc: <stable@vger.kernel.org> v3.4..v3.5
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2012-08-19 04:11:37 +08:00
|
|
|
54 64 setsockopt sys_setsockopt
|
|
|
|
55 64 getsockopt sys_getsockopt
|
2016-01-29 07:11:26 +08:00
|
|
|
56 common clone sys_clone/ptregs
|
|
|
|
57 common fork sys_fork/ptregs
|
|
|
|
58 common vfork sys_vfork/ptregs
|
|
|
|
59 64 execve sys_execve/ptregs
|
2012-02-15 06:18:50 +08:00
|
|
|
60 common exit sys_exit
|
|
|
|
61 common wait4 sys_wait4
|
|
|
|
62 common kill sys_kill
|
|
|
|
63 common uname sys_newuname
|
|
|
|
64 common semget sys_semget
|
|
|
|
65 common semop sys_semop
|
|
|
|
66 common semctl sys_semctl
|
|
|
|
67 common shmdt sys_shmdt
|
|
|
|
68 common msgget sys_msgget
|
|
|
|
69 common msgsnd sys_msgsnd
|
|
|
|
70 common msgrcv sys_msgrcv
|
|
|
|
71 common msgctl sys_msgctl
|
|
|
|
72 common fcntl sys_fcntl
|
|
|
|
73 common flock sys_flock
|
|
|
|
74 common fsync sys_fsync
|
|
|
|
75 common fdatasync sys_fdatasync
|
|
|
|
76 common truncate sys_truncate
|
|
|
|
77 common ftruncate sys_ftruncate
|
|
|
|
78 common getdents sys_getdents
|
|
|
|
79 common getcwd sys_getcwd
|
|
|
|
80 common chdir sys_chdir
|
|
|
|
81 common fchdir sys_fchdir
|
|
|
|
82 common rename sys_rename
|
|
|
|
83 common mkdir sys_mkdir
|
|
|
|
84 common rmdir sys_rmdir
|
|
|
|
85 common creat sys_creat
|
|
|
|
86 common link sys_link
|
|
|
|
87 common unlink sys_unlink
|
|
|
|
88 common symlink sys_symlink
|
|
|
|
89 common readlink sys_readlink
|
|
|
|
90 common chmod sys_chmod
|
|
|
|
91 common fchmod sys_fchmod
|
|
|
|
92 common chown sys_chown
|
|
|
|
93 common fchown sys_fchown
|
|
|
|
94 common lchown sys_lchown
|
|
|
|
95 common umask sys_umask
|
|
|
|
96 common gettimeofday sys_gettimeofday
|
|
|
|
97 common getrlimit sys_getrlimit
|
|
|
|
98 common getrusage sys_getrusage
|
|
|
|
99 common sysinfo sys_sysinfo
|
2012-03-06 05:40:24 +08:00
|
|
|
100 common times sys_times
|
2012-03-06 07:32:11 +08:00
|
|
|
101 64 ptrace sys_ptrace
|
2012-02-15 06:18:50 +08:00
|
|
|
102 common getuid sys_getuid
|
|
|
|
103 common syslog sys_syslog
|
|
|
|
104 common getgid sys_getgid
|
|
|
|
105 common setuid sys_setuid
|
|
|
|
106 common setgid sys_setgid
|
|
|
|
107 common geteuid sys_geteuid
|
|
|
|
108 common getegid sys_getegid
|
|
|
|
109 common setpgid sys_setpgid
|
|
|
|
110 common getppid sys_getppid
|
|
|
|
111 common getpgrp sys_getpgrp
|
|
|
|
112 common setsid sys_setsid
|
|
|
|
113 common setreuid sys_setreuid
|
|
|
|
114 common setregid sys_setregid
|
|
|
|
115 common getgroups sys_getgroups
|
|
|
|
116 common setgroups sys_setgroups
|
|
|
|
117 common setresuid sys_setresuid
|
|
|
|
118 common getresuid sys_getresuid
|
|
|
|
119 common setresgid sys_setresgid
|
|
|
|
120 common getresgid sys_getresgid
|
|
|
|
121 common getpgid sys_getpgid
|
|
|
|
122 common setfsuid sys_setfsuid
|
|
|
|
123 common setfsgid sys_setfsgid
|
|
|
|
124 common getsid sys_getsid
|
|
|
|
125 common capget sys_capget
|
|
|
|
126 common capset sys_capset
|
2011-11-12 07:55:49 +08:00
|
|
|
127 64 rt_sigpending sys_rt_sigpending
|
|
|
|
128 64 rt_sigtimedwait sys_rt_sigtimedwait
|
|
|
|
129 64 rt_sigqueueinfo sys_rt_sigqueueinfo
|
2012-02-15 06:18:50 +08:00
|
|
|
130 common rt_sigsuspend sys_rt_sigsuspend
|
2012-12-15 03:09:47 +08:00
|
|
|
131 64 sigaltstack sys_sigaltstack
|
2012-02-15 06:18:50 +08:00
|
|
|
132 common utime sys_utime
|
|
|
|
133 common mknod sys_mknod
|
2011-11-12 07:55:49 +08:00
|
|
|
134 64 uselib
|
2012-02-15 06:18:50 +08:00
|
|
|
135 common personality sys_personality
|
|
|
|
136 common ustat sys_ustat
|
|
|
|
137 common statfs sys_statfs
|
|
|
|
138 common fstatfs sys_fstatfs
|
|
|
|
139 common sysfs sys_sysfs
|
|
|
|
140 common getpriority sys_getpriority
|
|
|
|
141 common setpriority sys_setpriority
|
|
|
|
142 common sched_setparam sys_sched_setparam
|
|
|
|
143 common sched_getparam sys_sched_getparam
|
|
|
|
144 common sched_setscheduler sys_sched_setscheduler
|
|
|
|
145 common sched_getscheduler sys_sched_getscheduler
|
|
|
|
146 common sched_get_priority_max sys_sched_get_priority_max
|
|
|
|
147 common sched_get_priority_min sys_sched_get_priority_min
|
|
|
|
148 common sched_rr_get_interval sys_sched_rr_get_interval
|
|
|
|
149 common mlock sys_mlock
|
|
|
|
150 common munlock sys_munlock
|
|
|
|
151 common mlockall sys_mlockall
|
|
|
|
152 common munlockall sys_munlockall
|
|
|
|
153 common vhangup sys_vhangup
|
|
|
|
154 common modify_ldt sys_modify_ldt
|
|
|
|
155 common pivot_root sys_pivot_root
|
2011-11-12 07:55:49 +08:00
|
|
|
156 64 _sysctl sys_sysctl
|
2012-02-15 06:18:50 +08:00
|
|
|
157 common prctl sys_prctl
|
|
|
|
158 common arch_prctl sys_arch_prctl
|
|
|
|
159 common adjtimex sys_adjtimex
|
|
|
|
160 common setrlimit sys_setrlimit
|
|
|
|
161 common chroot sys_chroot
|
|
|
|
162 common sync sys_sync
|
|
|
|
163 common acct sys_acct
|
|
|
|
164 common settimeofday sys_settimeofday
|
|
|
|
165 common mount sys_mount
|
|
|
|
166 common umount2 sys_umount
|
|
|
|
167 common swapon sys_swapon
|
|
|
|
168 common swapoff sys_swapoff
|
|
|
|
169 common reboot sys_reboot
|
|
|
|
170 common sethostname sys_sethostname
|
|
|
|
171 common setdomainname sys_setdomainname
|
2016-02-01 01:33:28 +08:00
|
|
|
172 common iopl sys_iopl/ptregs
|
2012-02-15 06:18:50 +08:00
|
|
|
173 common ioperm sys_ioperm
|
2011-11-12 07:55:49 +08:00
|
|
|
174 64 create_module
|
2012-02-15 06:18:50 +08:00
|
|
|
175 common init_module sys_init_module
|
|
|
|
176 common delete_module sys_delete_module
|
2011-11-12 07:55:49 +08:00
|
|
|
177 64 get_kernel_syms
|
|
|
|
178 64 query_module
|
2012-02-15 06:18:50 +08:00
|
|
|
179 common quotactl sys_quotactl
|
2011-11-12 07:55:49 +08:00
|
|
|
180 64 nfsservctl
|
2012-02-15 06:18:50 +08:00
|
|
|
181 common getpmsg
|
|
|
|
182 common putpmsg
|
|
|
|
183 common afs_syscall
|
|
|
|
184 common tuxcall
|
|
|
|
185 common security
|
|
|
|
186 common gettid sys_gettid
|
|
|
|
187 common readahead sys_readahead
|
|
|
|
188 common setxattr sys_setxattr
|
|
|
|
189 common lsetxattr sys_lsetxattr
|
|
|
|
190 common fsetxattr sys_fsetxattr
|
|
|
|
191 common getxattr sys_getxattr
|
|
|
|
192 common lgetxattr sys_lgetxattr
|
|
|
|
193 common fgetxattr sys_fgetxattr
|
|
|
|
194 common listxattr sys_listxattr
|
|
|
|
195 common llistxattr sys_llistxattr
|
|
|
|
196 common flistxattr sys_flistxattr
|
|
|
|
197 common removexattr sys_removexattr
|
|
|
|
198 common lremovexattr sys_lremovexattr
|
|
|
|
199 common fremovexattr sys_fremovexattr
|
|
|
|
200 common tkill sys_tkill
|
|
|
|
201 common time sys_time
|
|
|
|
202 common futex sys_futex
|
|
|
|
203 common sched_setaffinity sys_sched_setaffinity
|
|
|
|
204 common sched_getaffinity sys_sched_getaffinity
|
2011-11-12 07:55:49 +08:00
|
|
|
205 64 set_thread_area
|
2014-05-05 08:43:15 +08:00
|
|
|
206 64 io_setup sys_io_setup
|
2012-02-15 06:18:50 +08:00
|
|
|
207 common io_destroy sys_io_destroy
|
|
|
|
208 common io_getevents sys_io_getevents
|
2014-05-05 08:43:15 +08:00
|
|
|
209 64 io_submit sys_io_submit
|
2012-02-15 06:18:50 +08:00
|
|
|
210 common io_cancel sys_io_cancel
|
2011-11-12 07:55:49 +08:00
|
|
|
211 64 get_thread_area
|
2012-02-15 06:18:50 +08:00
|
|
|
212 common lookup_dcookie sys_lookup_dcookie
|
|
|
|
213 common epoll_create sys_epoll_create
|
2011-11-12 07:55:49 +08:00
|
|
|
214 64 epoll_ctl_old
|
|
|
|
215 64 epoll_wait_old
|
2012-02-15 06:18:50 +08:00
|
|
|
216 common remap_file_pages sys_remap_file_pages
|
|
|
|
217 common getdents64 sys_getdents64
|
|
|
|
218 common set_tid_address sys_set_tid_address
|
|
|
|
219 common restart_syscall sys_restart_syscall
|
|
|
|
220 common semtimedop sys_semtimedop
|
|
|
|
221 common fadvise64 sys_fadvise64
|
2011-11-12 07:55:49 +08:00
|
|
|
222 64 timer_create sys_timer_create
|
2012-02-15 06:18:50 +08:00
|
|
|
223 common timer_settime sys_timer_settime
|
|
|
|
224 common timer_gettime sys_timer_gettime
|
|
|
|
225 common timer_getoverrun sys_timer_getoverrun
|
|
|
|
226 common timer_delete sys_timer_delete
|
|
|
|
227 common clock_settime sys_clock_settime
|
|
|
|
228 common clock_gettime sys_clock_gettime
|
|
|
|
229 common clock_getres sys_clock_getres
|
|
|
|
230 common clock_nanosleep sys_clock_nanosleep
|
|
|
|
231 common exit_group sys_exit_group
|
|
|
|
232 common epoll_wait sys_epoll_wait
|
|
|
|
233 common epoll_ctl sys_epoll_ctl
|
|
|
|
234 common tgkill sys_tgkill
|
|
|
|
235 common utimes sys_utimes
|
2011-11-12 07:55:49 +08:00
|
|
|
236 64 vserver
|
2012-02-15 06:18:50 +08:00
|
|
|
237 common mbind sys_mbind
|
|
|
|
238 common set_mempolicy sys_set_mempolicy
|
|
|
|
239 common get_mempolicy sys_get_mempolicy
|
|
|
|
240 common mq_open sys_mq_open
|
|
|
|
241 common mq_unlink sys_mq_unlink
|
|
|
|
242 common mq_timedsend sys_mq_timedsend
|
|
|
|
243 common mq_timedreceive sys_mq_timedreceive
|
2011-11-12 07:55:49 +08:00
|
|
|
244 64 mq_notify sys_mq_notify
|
2012-02-15 06:18:50 +08:00
|
|
|
245 common mq_getsetattr sys_mq_getsetattr
|
2011-11-12 07:55:49 +08:00
|
|
|
246 64 kexec_load sys_kexec_load
|
|
|
|
247 64 waitid sys_waitid
|
2012-02-15 06:18:50 +08:00
|
|
|
248 common add_key sys_add_key
|
|
|
|
249 common request_key sys_request_key
|
|
|
|
250 common keyctl sys_keyctl
|
|
|
|
251 common ioprio_set sys_ioprio_set
|
|
|
|
252 common ioprio_get sys_ioprio_get
|
|
|
|
253 common inotify_init sys_inotify_init
|
|
|
|
254 common inotify_add_watch sys_inotify_add_watch
|
|
|
|
255 common inotify_rm_watch sys_inotify_rm_watch
|
|
|
|
256 common migrate_pages sys_migrate_pages
|
|
|
|
257 common openat sys_openat
|
|
|
|
258 common mkdirat sys_mkdirat
|
|
|
|
259 common mknodat sys_mknodat
|
|
|
|
260 common fchownat sys_fchownat
|
|
|
|
261 common futimesat sys_futimesat
|
|
|
|
262 common newfstatat sys_newfstatat
|
|
|
|
263 common unlinkat sys_unlinkat
|
|
|
|
264 common renameat sys_renameat
|
|
|
|
265 common linkat sys_linkat
|
|
|
|
266 common symlinkat sys_symlinkat
|
|
|
|
267 common readlinkat sys_readlinkat
|
|
|
|
268 common fchmodat sys_fchmodat
|
|
|
|
269 common faccessat sys_faccessat
|
|
|
|
270 common pselect6 sys_pselect6
|
|
|
|
271 common ppoll sys_ppoll
|
|
|
|
272 common unshare sys_unshare
|
2011-11-12 07:55:49 +08:00
|
|
|
273 64 set_robust_list sys_set_robust_list
|
|
|
|
274 64 get_robust_list sys_get_robust_list
|
2012-02-15 06:18:50 +08:00
|
|
|
275 common splice sys_splice
|
|
|
|
276 common tee sys_tee
|
|
|
|
277 common sync_file_range sys_sync_file_range
|
2011-11-12 07:55:49 +08:00
|
|
|
278 64 vmsplice sys_vmsplice
|
|
|
|
279 64 move_pages sys_move_pages
|
2012-02-15 06:18:50 +08:00
|
|
|
280 common utimensat sys_utimensat
|
|
|
|
281 common epoll_pwait sys_epoll_pwait
|
|
|
|
282 common signalfd sys_signalfd
|
|
|
|
283 common timerfd_create sys_timerfd_create
|
|
|
|
284 common eventfd sys_eventfd
|
|
|
|
285 common fallocate sys_fallocate
|
|
|
|
286 common timerfd_settime sys_timerfd_settime
|
|
|
|
287 common timerfd_gettime sys_timerfd_gettime
|
|
|
|
288 common accept4 sys_accept4
|
|
|
|
289 common signalfd4 sys_signalfd4
|
|
|
|
290 common eventfd2 sys_eventfd2
|
|
|
|
291 common epoll_create1 sys_epoll_create1
|
|
|
|
292 common dup3 sys_dup3
|
|
|
|
293 common pipe2 sys_pipe2
|
|
|
|
294 common inotify_init1 sys_inotify_init1
|
2011-11-12 07:55:49 +08:00
|
|
|
295 64 preadv sys_preadv
|
|
|
|
296 64 pwritev sys_pwritev
|
|
|
|
297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo
|
2012-02-15 06:18:50 +08:00
|
|
|
298 common perf_event_open sys_perf_event_open
|
2011-11-12 07:55:49 +08:00
|
|
|
299 64 recvmmsg sys_recvmmsg
|
2012-02-15 06:18:50 +08:00
|
|
|
300 common fanotify_init sys_fanotify_init
|
|
|
|
301 common fanotify_mark sys_fanotify_mark
|
|
|
|
302 common prlimit64 sys_prlimit64
|
|
|
|
303 common name_to_handle_at sys_name_to_handle_at
|
|
|
|
304 common open_by_handle_at sys_open_by_handle_at
|
|
|
|
305 common clock_adjtime sys_clock_adjtime
|
|
|
|
306 common syncfs sys_syncfs
|
2011-11-12 07:55:49 +08:00
|
|
|
307 64 sendmmsg sys_sendmmsg
|
2012-02-15 06:18:50 +08:00
|
|
|
308 common setns sys_setns
|
|
|
|
309 common getcpu sys_getcpu
|
2011-11-12 07:55:49 +08:00
|
|
|
310 64 process_vm_readv sys_process_vm_readv
|
|
|
|
311 64 process_vm_writev sys_process_vm_writev
|
2012-08-02 06:59:58 +08:00
|
|
|
312 common kcmp sys_kcmp
|
module: add syscall to load module from fd
As part of the effort to create a stronger boundary between root and
kernel, Chrome OS wants to be able to enforce that kernel modules are
being loaded only from our read-only crypto-hash verified (dm_verity)
root filesystem. Since the init_module syscall hands the kernel a module
as a memory blob, no reasoning about the origin of the blob can be made.
Earlier proposals for appending signatures to kernel modules would not be
useful in Chrome OS, since it would involve adding an additional set of
keys to our kernel and builds for no good reason: we already trust the
contents of our root filesystem. We don't need to verify those kernel
modules a second time. Having to do signature checking on module loading
would slow us down and be redundant. All we need to know is where a
module is coming from so we can say yes/no to loading it.
If a file descriptor is used as the source of a kernel module, many more
things can be reasoned about. In Chrome OS's case, we could enforce that
the module lives on the filesystem we expect it to live on. In the case
of IMA (or other LSMs), it would be possible, for example, to examine
extended attributes that may contain signatures over the contents of
the module.
This introduces a new syscall (on x86), similar to init_module, that has
only two arguments. The first argument is used as a file descriptor to
the module and the second argument is a pointer to the NULL terminated
string of module arguments.
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (merge fixes)
2012-10-16 05:01:07 +08:00
|
|
|
313 common finit_module sys_finit_module
|
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).
In general, it makes it possible to specify a periodic/sporadic task,
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of its own timing constraints,
i.e.:
- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.
Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.
For these reasons, this patch:
- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().
Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.
Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:36 +08:00
|
|
|
314 common sched_setattr sys_sched_setattr
|
|
|
|
315 common sched_getattr sys_sched_getattr
|
2014-04-01 23:08:42 +08:00
|
|
|
316 common renameat2 sys_renameat2
|
2014-06-26 07:08:24 +08:00
|
|
|
317 common seccomp sys_seccomp
|
random: introduce getrandom(2) system call
The getrandom(2) system call was requested by the LibreSSL Portable
developers. It is analogous to the getentropy(2) system call in
OpenBSD.
The rationale of this system call is to provide resilience against
file descriptor exhaustion attacks, where the attacker consumes all
available file descriptors, forcing the use of the fallback code where
/dev/[u]random is not available. Since the fallback code is often not
well-tested, it is better to eliminate this potential failure mode
entirely.
The other feature provided by this new system call is the ability to
request randomness from the /dev/urandom entropy pool, but to block
until at least 128 bits of entropy has been accumulated in the
/dev/urandom entropy pool. Historically, the emphasis in the
/dev/urandom development has been to ensure that urandom pool is
initialized as quickly as possible after system boot, and preferably
before the init scripts start execution.
This is because changing /dev/urandom reads to block represents an
interface change that could potentially break userspace which is not
acceptable. In practice, on most x86 desktop and server systems, in
general the entropy pool can be initialized before it is needed (and
in modern kernels, we will printk a warning message if not). However,
on an embedded system, this may not be the case. And so with this new
interface, we can provide the functionality of blocking until the
urandom pool has been initialized. Any userspace program which uses
this new functionality must take care to assure that if it is used
during the boot process, that it will not cause the init scripts or
other portions of the system startup to hang indefinitely.
SYNOPSIS
#include <linux/random.h>
int getrandom(void *buf, size_t buflen, unsigned int flags);
DESCRIPTION
The system call getrandom() fills the buffer pointed to by buf
with up to buflen random bytes which can be used to seed user
space random number generators (i.e., DRBG's) or for other
cryptographic uses. It should not be used for Monte Carlo
simulations or other programs/algorithms which are doing
probabilistic sampling.
If the GRND_RANDOM flags bit is set, then draw from the
/dev/random pool instead of the /dev/urandom pool. The
/dev/random pool is limited based on the entropy that can be
obtained from environmental noise, so if there is insufficient
entropy, the requested number of bytes may not be returned.
If there is no entropy available at all, getrandom(2) will
either block, or return an error with errno set to EAGAIN if
the GRND_NONBLOCK bit is set in flags.
If the GRND_RANDOM bit is not set, then the /dev/urandom pool
will be used. Unlike using read(2) to fetch data from
/dev/urandom, if the urandom pool has not been sufficiently
initialized, getrandom(2) will block (or return -1 with the
errno set to EAGAIN if the GRND_NONBLOCK bit is set in flags).
The getentropy(2) system call in OpenBSD can be emulated using
the following function:
int getentropy(void *buf, size_t buflen)
{
int ret;
if (buflen > 256)
goto failure;
ret = getrandom(buf, buflen, 0);
if (ret < 0)
return ret;
if (ret == buflen)
return 0;
failure:
errno = EIO;
return -1;
}
RETURN VALUE
On success, the number of bytes that was filled in the buf is
returned. This may not be all the bytes requested by the
caller via buflen if insufficient entropy was present in the
/dev/random pool, or if the system call was interrupted by a
signal.
On error, -1 is returned, and errno is set appropriately.
ERRORS
EINVAL An invalid flag was passed to getrandom(2)
EFAULT buf is outside the accessible address space.
EAGAIN The requested entropy was not available, and
getrandom(2) would have blocked if the
GRND_NONBLOCK flag was not set.
EINTR While blocked waiting for entropy, the call was
interrupted by a signal handler; see the description
of how interrupted read(2) calls on "slow" devices
are handled with and without the SA_RESTART flag
in the signal(7) man page.
NOTES
For small requests (buflen <= 256) getrandom(2) will not
return EINTR when reading from the urandom pool once the
entropy pool has been initialized, and it will return all of
the bytes that have been requested. This is the recommended
way to use getrandom(2), and is designed for compatibility
with OpenBSD's getentropy() system call.
However, if you are using GRND_RANDOM, then getrandom(2) may
block until the entropy accounting determines that sufficient
environmental noise has been gathered such that getrandom(2)
will be operating as a NRBG instead of a DRBG for those people
who are working in the NIST SP 800-90 regime. Since it may
block for a long time, these guarantees do *not* apply. The
user may want to interrupt a hanging process using a signal,
so blocking until all of the requested bytes are returned
would be unfriendly.
For this reason, the user of getrandom(2) MUST always check
the return value, in case it returns some error, or if fewer
bytes than requested were returned. In the case of
!GRND_RANDOM and small request, the latter should never
happen, but the careful userspace code (and all crypto code
should be careful) should check for this anyway!
Finally, unless you are doing long-term key generation (and
perhaps not even then), you probably shouldn't be using
GRND_RANDOM. The cryptographic algorithms used for
/dev/urandom are quite conservative, and so should be
sufficient for all purposes. The disadvantage of GRND_RANDOM
is that it can block, and the increased complexity required to
deal with partially fulfilled getrandom(2) requests.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Zach Brown <zab@zabbo.net>
2014-07-17 16:13:05 +08:00
|
|
|
318 common getrandom sys_getrandom
|
shm: add memfd_create() syscall
memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
that you can pass to mmap(). It can support sealing and avoids any
connection to user-visible mount-points. Thus, it's not subject to quotas
on mounted file-systems, but can be used like malloc()'ed memory, but with
a file-descriptor to it.
memfd_create() returns the raw shmem file, so calls like ftruncate() can
be used to modify the underlying inode. Also calls like fstat() will
return proper information and mark the file as regular file. If you want
sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not
supported (like on all other regular files).
Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
subject to a filesystem size limit. It is still properly accounted to
memcg limits, though, and to the same overcommit or no-overcommit
accounting as all user memory.
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ryan Lortie <desrt@desrt.ca>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <zonque@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 05:25:29 +08:00
|
|
|
319 common memfd_create sys_memfd_create
|
2014-08-09 05:25:55 +08:00
|
|
|
320 common kexec_file_load sys_kexec_file_load
|
2014-09-26 15:16:58 +08:00
|
|
|
321 common bpf sys_bpf
|
2016-01-29 07:11:26 +08:00
|
|
|
322 64 execveat sys_execveat/ptregs
|
2015-09-05 06:46:58 +08:00
|
|
|
323 common userfaultfd sys_userfaultfd
|
sys_membarrier(): system-wide memory barrier (generic, x86)
Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system. It is
implemented by calling synchronize_sched(). It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier. For synchronization primitives
that distinguish between read-side and write-side (e.g. userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.
The existing applications of which I am aware that would be improved by
this system call are as follows:
* Through Userspace RCU library (http://urcu.so)
- DNS server (Knot DNS) https://www.knot-dns.cz/
- Network sniffer (http://netsniff-ng.org/)
- Distributed object storage (https://sheepdog.github.io/sheepdog/)
- User-space tracing (http://lttng.org)
- Network storage system (https://www.gluster.org/)
- Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
- Financial software (https://lkml.org/lkml/2015/3/23/189)
Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking. Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().
* Direct users of sys_membarrier
- core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)
Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux. They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.
To explain the benefit of this scheme, let's introduce two example threads:
Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())
In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".
Before the change, we had, for each smp_mb() pairs:
Thread A Thread B
previous mem accesses previous mem accesses
smp_mb() smp_mb()
following mem accesses following mem accesses
After the change, these pairs become:
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).
1) Non-concurrent Thread A vs Thread B accesses:
Thread A Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
prev mem accesses
barrier()
follow mem accesses
In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.
2) Concurrent Thread A vs Thread B accesses
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().
* Benchmarks
On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)
1000 non-expedited sys_membarrier calls in 33s = 33 milliseconds/call.
* User-space user of this system call: Userspace RCU library
Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every process
thread (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such barrier anyway, because these are
implied by the scheduler context switches.
Results in liburcu:
Operations in 10s, 6 readers, 2 writers:
memory barriers in reader: 1701557485 reads, 2202847 writes
signal-based scheme: 9830061167 reads, 6700 writes
sys_membarrier: 9952759104 reads, 425 writes
sys_membarrier (dyn. check): 7970328887 reads, 425 writes
The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.
Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.
An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.
This patch adds the system call to x86 and to asm-generic.
[1] http://urcu.so
membarrier(2) man page:
MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2)
NAME
membarrier - issue memory barriers on a set of threads
SYNOPSIS
#include <linux/membarrier.h>
int membarrier(int cmd, int flags);
DESCRIPTION
The cmd argument is one of the following:
MEMBARRIER_CMD_QUERY
Query the set of supported commands. It returns a bitmask of
supported commands.
MEMBARRIER_CMD_SHARED
Execute a memory barrier on all threads running on the system.
Upon return from system call, the caller thread is ensured that
all running threads have passed through a state where all memory
accesses to user-space addresses match program order between
entry to and return from the system call (non-running threads
are de facto in such a state). This covers threads from all processes
running on the system. This command returns 0.
The flags argument needs to be 0. For future extensions.
All memory accesses performed in program order from each targeted
thread are guaranteed to be ordered with respect to sys_membarrier(). If
we use the semantic "barrier()" to represent a compiler barrier forcing
memory accesses to be performed in program order across the barrier,
and smp_mb() to represent explicit memory barriers forcing full memory
ordering across the barrier, we have the following ordering table for
each pair of barrier(), sys_membarrier() and smp_mb():
The pair ordering is detailed as (O: ordered, X: not ordered):
barrier() smp_mb() sys_membarrier()
barrier() X X O
smp_mb() X O O
sys_membarrier() O O O
RETURN VALUE
On success, these system calls return zero. On error, -1 is returned,
and errno is set appropriately. For a given command, with flags
argument set to 0, this system call is guaranteed to always return the
same value until reboot.
ERRORS
ENOSYS System call is not implemented.
EINVAL Invalid arguments.
Linux 2015-04-15 MEMBARRIER(2)
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-12 04:07:39 +08:00
|
|
|
324 common membarrier sys_membarrier
|
2015-11-06 10:51:33 +08:00
|
|
|
325 common mlock2 sys_mlock2
|
2015-11-11 05:53:31 +08:00
|
|
|
326 common copy_file_range sys_copy_file_range
|
2016-03-03 23:04:00 +08:00
|
|
|
327 64 preadv2 sys_preadv2
|
|
|
|
328 64 pwritev2 sys_pwritev2
|
2016-07-30 00:30:17 +08:00
|
|
|
329 common pkey_mprotect sys_pkey_mprotect
|
|
|
|
330 common pkey_alloc sys_pkey_alloc
|
|
|
|
331 common pkey_free sys_pkey_free
|
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spares*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-02-01 00:46:22 +08:00
|
|
|
332 common statx sys_statx
|
2012-06-01 07:26:44 +08:00
|
|
|
|
2012-02-15 06:18:50 +08:00
|
|
|
#
|
|
|
|
# x32-specific system call numbers start at 512 to avoid cache impact
|
|
|
|
# for native 64-bit operation.
|
|
|
|
#
|
2012-12-26 07:42:26 +08:00
|
|
|
512 x32 rt_sigaction compat_sys_rt_sigaction
|
2016-01-29 07:11:26 +08:00
|
|
|
513 x32 rt_sigreturn sys32_x32_rt_sigreturn
|
2012-02-15 06:18:50 +08:00
|
|
|
514 x32 ioctl compat_sys_ioctl
|
|
|
|
515 x32 readv compat_sys_readv
|
|
|
|
516 x32 writev compat_sys_writev
|
|
|
|
517 x32 recvfrom compat_sys_recvfrom
|
|
|
|
518 x32 sendmsg compat_sys_sendmsg
|
|
|
|
519 x32 recvmsg compat_sys_recvmsg
|
2016-01-29 07:11:26 +08:00
|
|
|
520 x32 execve compat_sys_execve/ptregs
|
2012-03-06 07:32:11 +08:00
|
|
|
521 x32 ptrace compat_sys_ptrace
|
2012-12-26 03:46:17 +08:00
|
|
|
522 x32 rt_sigpending compat_sys_rt_sigpending
|
2012-02-15 06:18:50 +08:00
|
|
|
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
|
2012-12-26 04:26:55 +08:00
|
|
|
524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo
|
2012-12-15 03:47:53 +08:00
|
|
|
525 x32 sigaltstack compat_sys_sigaltstack
|
2012-02-15 06:18:50 +08:00
|
|
|
526 x32 timer_create compat_sys_timer_create
|
|
|
|
527 x32 mq_notify compat_sys_mq_notify
|
|
|
|
528 x32 kexec_load compat_sys_kexec_load
|
|
|
|
529 x32 waitid compat_sys_waitid
|
|
|
|
530 x32 set_robust_list compat_sys_set_robust_list
|
|
|
|
531 x32 get_robust_list compat_sys_get_robust_list
|
|
|
|
532 x32 vmsplice compat_sys_vmsplice
|
|
|
|
533 x32 move_pages compat_sys_move_pages
|
|
|
|
534 x32 preadv compat_sys_preadv64
|
|
|
|
535 x32 pwritev compat_sys_pwritev64
|
|
|
|
536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
|
|
|
|
537 x32 recvmmsg compat_sys_recvmmsg
|
|
|
|
538 x32 sendmmsg compat_sys_sendmmsg
|
|
|
|
539 x32 process_vm_readv compat_sys_process_vm_readv
|
|
|
|
540 x32 process_vm_writev compat_sys_process_vm_writev
|
x32: Use compat shims for {g,s}etsockopt
Some of the arguments to {g,s}etsockopt are passed in userland pointers.
If we try to use the 64bit entry point, we end up sometimes failing.
For example, dhcpcd doesn't run in x32:
# dhcpcd eth0
dhcpcd[1979]: version 5.5.6 starting
dhcpcd[1979]: eth0: broadcasting for a lease
dhcpcd[1979]: eth0: open_socket: Invalid argument
dhcpcd[1979]: eth0: send_raw_packet: Bad file descriptor
The code in particular is getting back EINVAL when doing:
struct sock_fprog pf;
setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &pf, sizeof(pf));
Diving into the kernel code, we can see:
include/linux/filter.h:
struct sock_fprog {
unsigned short len;
struct sock_filter __user *filter;
};
net/core/sock.c:
case SO_ATTACH_FILTER:
ret = -EINVAL;
if (optlen == sizeof(struct sock_fprog)) {
struct sock_fprog fprog;
ret = -EFAULT;
if (copy_from_user(&fprog, optval, sizeof(fprog)))
break;
ret = sk_attach_filter(&fprog, sk);
}
break;
arch/x86/syscalls/syscall_64.tbl:
54 common setsockopt sys_setsockopt
55 common getsockopt sys_getsockopt
So for x64, sizeof(sock_fprog) is 16 bytes. For x86/x32, it's 8 bytes.
This comes down to the pointer being 32bit for x32, which means we need
to do structure size translation. But since x32 comes in directly to
sys_setsockopt, it doesn't get translated like x86.
After changing the syscall table and rebuilding glibc with the new kernel
headers, dhcp runs fine in an x32 userland.
Oddly, it seems like Linus noted the same thing during the initial port,
but I guess that was missed/lost along the way:
https://lkml.org/lkml/2011/8/26/452
[ hpa: tagging for -stable since this is an ABI fix. ]
Bugzilla: https://bugs.gentoo.org/423649
Reported-by: Mads <mads@ab3.no>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Link: http://lkml.kernel.org/r/1345320697-15713-1-git-send-email-vapier@gentoo.org
Cc: H. J. Lu <hjl.tools@gmail.com>
Cc: <stable@vger.kernel.org> v3.4..v3.5
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2012-08-19 04:11:37 +08:00
|
|
|
541 x32 setsockopt compat_sys_setsockopt
|
|
|
|
542 x32 getsockopt compat_sys_getsockopt
|
2014-05-05 08:43:15 +08:00
|
|
|
543 x32 io_setup compat_sys_io_setup
|
|
|
|
544 x32 io_submit compat_sys_io_submit
|
2016-01-29 07:11:26 +08:00
|
|
|
545 x32 execveat compat_sys_execveat/ptregs
|
2016-07-15 03:31:53 +08:00
|
|
|
546 x32 preadv2 compat_sys_preadv64v2
|
|
|
|
547 x32 pwritev2 compat_sys_pwritev64v2
|