104 Bring back lx brand
--- old/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ new/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 #include <sys/asm_linkage.h>
26 26 #include <sys/asm_misc.h>
27 27 #include <sys/regset.h>
28 28 #include <sys/privregs.h>
29 29 #include <sys/psw.h>
30 30 #include <sys/machbrand.h>
31 31
32 32 #if defined(__lint)
33 33
34 34 #include <sys/types.h>
35 35 #include <sys/thread.h>
36 36 #include <sys/systm.h>
37 37
38 38 #else /* __lint */
39 39
40 40 #include <sys/segments.h>
41 41 #include <sys/pcb.h>
42 42 #include <sys/trap.h>
43 43 #include <sys/ftrace.h>
44 44 #include <sys/traptrace.h>
45 45 #include <sys/clock.h>
46 46 #include <sys/model.h>
47 47 #include <sys/panic.h>
48 48
49 49 #if defined(__xpv)
50 50 #include <sys/hypervisor.h>
51 51 #endif
52 52
53 53 #include "assym.h"
54 54
55 55 #endif /* __lint */
56 56
57 57 /*
58 58 * We implement five flavours of system call entry points
59 59 *
60 60 * - syscall/sysretq (amd64 generic)
61 61 * - syscall/sysretl (i386 plus SYSC bit)
62 62 * - sysenter/sysexit (i386 plus SEP bit)
63 63 * - int/iret (i386 generic)
64 64 * - lcall/iret (i386 generic)
65 65 *
66 66 * The current libc included in Solaris uses int/iret as the base unoptimized
67 67 * kernel entry method. Older libc implementations and legacy binaries may use
68 68 * the lcall call gate, so it must continue to be supported.
69 69 *
70 70 * System calls that use an lcall call gate are processed in trap() via a
71 71 * segment-not-present trap, i.e. lcalls are extremely slow(!).
72 72 *
73 73 * The basic pattern used in the 32-bit SYSC handler at this point in time is
74 74 * to have the bare minimum of assembler, and get to the C handlers as
75 75 * quickly as possible.
76 76 *
77 77 * The 64-bit handler is much closer to the sparcv9 handler; that's
78 78 * because of passing arguments in registers. The 32-bit world still
79 79 * passes arguments on the stack -- that makes that handler substantially
80 80 * more complex.
81 81 *
82 82 * The two handlers share a few code fragments which are broken
83 83 * out into preprocessor macros below.
84 84 *
85 85 * XX64 come back and speed all this up later. The 32-bit stuff looks
86 86 * especially easy to speed up the argument copying part ..
87 87 *
88 88 *
89 89 * Notes about segment register usage (c.f. the 32-bit kernel)
90 90 *
91 91 * In the 32-bit kernel, segment registers are dutifully saved and
92 92 * restored on all mode transitions because the kernel uses them directly.
93 93 * When the processor is running in 64-bit mode, segment registers are
94 94 * largely ignored.
95 95 *
96 96 * %cs and %ss
97 97 * controlled by the hardware mechanisms that make mode transitions
98 98 *
99 99 * The remaining segment registers have to either be pointing at a valid
 100  100  * descriptor i.e. with the 'present' bit set, or they can be NULL descriptors
101 101 *
102 102 * %ds and %es
103 103 * always ignored
104 104 *
105 105 * %fs and %gs
106 106 * fsbase and gsbase are used to control the place they really point at.
107 107 * The kernel only depends on %gs, and controls its own gsbase via swapgs
108 108 *
109 109 * Note that loading segment registers is still costly because the GDT
110 110 * lookup still happens (this is because the hardware can't know that we're
111 111 * not setting up these segment registers for a 32-bit program). Thus we
112 112 * avoid doing this in the syscall path, and defer them to lwp context switch
113 113 * handlers, so the register values remain virtualized to the lwp.
114 114 */
115 115
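The deferral described above is what the PCB_RUPDATE checks later in this file are
testing for. A rough C rendering of the idea, using names this file already
references (pcb_rupdate, update_sregs); a sketch, not the actual post_syscall code:

	/*
	 * Segment registers are only reloaded on the way back to userland
	 * if something actually changed them; the fast return path just
	 * tests the pcb_rupdate flag.
	 */
	if (lwp->lwp_pcb.pcb_rupdate != 0) {
		update_sregs(rp, lwp);		/* reload %ds/%es/%fs/%gs */
		lwp->lwp_pcb.pcb_rupdate = 0;
	}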
116 116 #if defined(SYSCALLTRACE)
117 117 #define ORL_SYSCALLTRACE(r32) \
118 118 orl syscalltrace(%rip), r32
119 119 #else
120 120 #define ORL_SYSCALLTRACE(r32)
121 121 #endif
122 122
123 123 /*
124 124 * In the 32-bit kernel, we do absolutely nothing before getting into the
125 125 * brand callback checks. In 64-bit land, we do swapgs and then come here.
126 126 * We assume that the %rsp- and %r15-stashing fields in the CPU structure
127 127 * are still unused.
128 128 *
129 129 * Check if a brand_mach_ops callback is defined for the specified callback_id
130 130 * type. If so invoke it with the kernel's %gs value loaded and the following
131 131 * data on the stack:
132 132 *
133 133 * stack: --------------------------------------
134 134 * 32 | callback pointer |
135 135 * | 24 | user (or interrupt) stack pointer |
136 136 * | 16 | lwp pointer |
137 137 * v 8 | userland return address |
138 138 * 0 | callback wrapper return addr |
139 139 * --------------------------------------
140 140 *
141 141 * Since we're pushing the userland return address onto the kernel stack
142 142 * we need to get that address without accessing the user's stack (since we
143 143 * can't trust that data). There are different ways to get the userland
144 144 * return address depending on how the syscall trap was made:
145 145 *
146 146 * a) For sys_syscall and sys_syscall32 the return address is in %rcx.
147 147 * b) For sys_sysenter the return address is in %rdx.
148 148 * c) For sys_int80 and sys_syscall_int (int91), upon entry into the macro,
149 149 * the stack pointer points at the state saved when we took the interrupt:
150 150 * ------------------------
151 151 * | | user's %ss |
152 152 * | | user's %esp |
153 153 * | | EFLAGS register |
154 154 * v | user's %cs |
155 155 * | user's %eip |
156 156 * ------------------------
157 157 *
158 158 * The 2nd parameter to the BRAND_CALLBACK macro is either the
159 159 * BRAND_URET_FROM_REG or BRAND_URET_FROM_INTR_STACK macro. These macros are
160 160 * used to generate the proper code to get the userland return address for
161 161 * each syscall entry point.
162 162 *
163 163 * The interface to the brand callbacks on the 64-bit kernel assumes %r15
164 164 * is available as a scratch register within the callback. If the callback
165 165 * returns within the kernel then this macro will restore %r15. If the
166 166 * callback is going to return directly to userland then it should restore
167 167 * %r15 before returning to userland.
168 168 */
169 169 #define BRAND_URET_FROM_REG(rip_reg) \
170 170 pushq rip_reg /* push the return address */
171 171
172 172 /*
173 173 * The interrupt stack pointer we saved on entry to the BRAND_CALLBACK macro
174 174 * is currently pointing at the user return address (%eip).
175 175 */
176 176 #define BRAND_URET_FROM_INTR_STACK() \
177 177 movq %gs:CPU_RTMP_RSP, %r15 /* grab the intr. stack pointer */ ;\
178 178 pushq (%r15) /* push the return address */
179 179
180 180 #define BRAND_CALLBACK(callback_id, push_userland_ret) \
181 181 movq %rsp, %gs:CPU_RTMP_RSP /* save the stack pointer */ ;\
182 182 movq %r15, %gs:CPU_RTMP_R15 /* save %r15 */ ;\
183 183 movq %gs:CPU_THREAD, %r15 /* load the thread pointer */ ;\
184 184 movq T_STACK(%r15), %rsp /* switch to the kernel stack */ ;\
185 185 subq $16, %rsp /* save space for 2 pointers */ ;\
186 186 pushq %r14 /* save %r14 */ ;\
187 187 movq %gs:CPU_RTMP_RSP, %r14 ;\
188 188 movq %r14, 8(%rsp) /* stash the user stack pointer */ ;\
189 189 popq %r14 /* restore %r14 */ ;\
190 190 movq T_LWP(%r15), %r15 /* load the lwp pointer */ ;\
191 191 pushq %r15 /* push the lwp pointer */ ;\
192 192 movq LWP_PROCP(%r15), %r15 /* load the proc pointer */ ;\
193 193 movq P_BRAND(%r15), %r15 /* load the brand pointer */ ;\
194 194 movq B_MACHOPS(%r15), %r15 /* load the machops pointer */ ;\
195 195 movq _CONST(_MUL(callback_id, CPTRSIZE))(%r15), %r15 ;\
196 196 cmpq $0, %r15 ;\
197 197 je 1f ;\
198 198 movq %r15, 16(%rsp) /* save the callback pointer */ ;\
199 199 push_userland_ret /* push the return address */ ;\
200 200 call *24(%rsp) /* call callback */ ;\
201 201 1: movq %gs:CPU_RTMP_R15, %r15 /* restore %r15 */ ;\
202 202 movq %gs:CPU_RTMP_RSP, %rsp /* restore the stack pointer */
203 203
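In C terms, the pointer chase and null test performed by BRAND_CALLBACK look
roughly like the sketch below; the member names follow the assym offsets used
above (T_LWP, LWP_PROCP, P_BRAND, B_MACHOPS), and the stack switch and
%r15/%rsp stashing have no C equivalent:

	/* sketch only: find and invoke the brand's entry hook, if any */
	void (**machops)(void) =
	    (void (**)(void))curthread->t_lwp->lwp_procp->p_brand->b_machops;
	void (*cb)(void) = machops[callback_id];

	if (cb != NULL)
		cb();		/* e.g. an lx or sn1 brand callback */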
204 204 #define MSTATE_TRANSITION(from, to) \
205 205 movl $from, %edi; \
206 206 movl $to, %esi; \
207 207 call syscall_mstate
208 208
209 209 /*
210 210 * Check to see if a simple (direct) return is possible i.e.
211 211 *
212 212 * if (t->t_post_sys_ast | syscalltrace |
213 213 * lwp->lwp_pcb.pcb_rupdate == 1)
214 214 * do full version ;
215 215 *
216 216 * Preconditions:
217 217 * - t is curthread
218 218 * Postconditions:
219 219 * - condition code NE is set if post-sys is too complex
220 220 * - rtmp is zeroed if it isn't (we rely on this!)
221 221 * - ltmp is smashed
222 222 */
223 223 #define CHECK_POSTSYS_NE(t, ltmp, rtmp) \
224 224 movq T_LWP(t), ltmp; \
225 225 movzbl PCB_RUPDATE(ltmp), rtmp; \
226 226 ORL_SYSCALLTRACE(rtmp); \
227 227 orl T_POST_SYS_AST(t), rtmp; \
228 228 cmpl $0, rtmp
229 229
230 230 /*
231 231 * Fix up the lwp, thread, and eflags for a successful return
232 232 *
233 233 * Preconditions:
234 234 * - zwreg contains zero
235 235 */
236 236 #define SIMPLE_SYSCALL_POSTSYS(t, lwp, zwreg) \
237 237 movb $LWP_USER, LWP_STATE(lwp); \
238 238 movw zwreg, T_SYSNUM(t); \
239 239 andb $_CONST(0xffff - PS_C), REGOFF_RFL(%rsp)
240 240
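Expressed in C, SIMPLE_SYSCALL_POSTSYS amounts to the following; the last line
clears PS_C in the saved flags, which is how "no error" is reported back to the
libc wrapper (carry set means the syscall failed):

	lwp->lwp_state = LWP_USER;	/* back to user accounting state */
	t->t_sysnum = 0;		/* no system call in progress */
	rp->r_rfl &= ~PS_C;		/* carry clear => success for libc */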
241 241 /*
242 242 * ASSERT(lwptoregs(lwp) == rp);
243 243 *
244 244 * This may seem obvious, but very odd things happen if this
245 245 * assertion is false
246 246 *
247 247 * Preconditions:
248 248 * (%rsp is ready for normal call sequence)
249 249 * Postconditions (if assertion is true):
250 250 * %r11 is smashed
251 251 *
252 252 * ASSERT(rp->r_cs == descnum)
253 253 *
254 254 * The code selector is written into the regs structure when the
255 255 * lwp stack is created. We use this ASSERT to validate that
256 256 * the regs structure really matches how we came in.
257 257 *
258 258 * Preconditions:
259 259 * (%rsp is ready for normal call sequence)
260 260 * Postconditions (if assertion is true):
261 261 * -none-
262 262 *
263 263 * ASSERT(lwp->lwp_pcb.pcb_rupdate == 0);
264 264 *
265 265 * If this is false, it meant that we returned to userland without
266 266 * updating the segment registers as we were supposed to.
267 267 *
268 268 * Note that we must ensure no interrupts or other traps intervene
269 269 * between entering privileged mode and performing the assertion,
270 270 * otherwise we may perform a context switch on the thread, which
271 271 * will end up setting pcb_rupdate to 1 again.
272 272 */
273 273 #if defined(DEBUG)
274 274
275 275 #if !defined(__lint)
276 276
277 277 __lwptoregs_msg:
278 278 .string "syscall_asm_amd64.s:%d lwptoregs(%p) [%p] != rp [%p]"
279 279
280 280 __codesel_msg:
281 281 .string "syscall_asm_amd64.s:%d rp->r_cs [%ld] != %ld"
282 282
283 283 __no_rupdate_msg:
284 284 .string "syscall_asm_amd64.s:%d lwp %p, pcb_rupdate != 0"
285 285
286 286 #endif /* !__lint */
287 287
288 288 #define ASSERT_LWPTOREGS(lwp, rp) \
289 289 movq LWP_REGS(lwp), %r11; \
290 290 cmpq rp, %r11; \
291 291 je 7f; \
292 292 leaq __lwptoregs_msg(%rip), %rdi; \
293 293 movl $__LINE__, %esi; \
294 294 movq lwp, %rdx; \
295 295 movq %r11, %rcx; \
296 296 movq rp, %r8; \
297 297 xorl %eax, %eax; \
298 298 call panic; \
299 299 7:
300 300
301 301 #define ASSERT_NO_RUPDATE_PENDING(lwp) \
302 302 testb $0x1, PCB_RUPDATE(lwp); \
303 303 je 8f; \
304 304 movq lwp, %rdx; \
305 305 leaq __no_rupdate_msg(%rip), %rdi; \
306 306 movl $__LINE__, %esi; \
307 307 xorl %eax, %eax; \
308 308 call panic; \
309 309 8:
310 310
311 311 #else
312 312 #define ASSERT_LWPTOREGS(lwp, rp)
313 313 #define ASSERT_NO_RUPDATE_PENDING(lwp)
314 314 #endif
315 315
316 316 /*
317 317 * Do the traptrace thing and restore any registers we used
318 318 * in situ. Assumes that %rsp is pointing at the base of
319 319 * the struct regs, obviously ..
320 320 */
321 321 #ifdef TRAPTRACE
322 322 #define SYSCALL_TRAPTRACE(ttype) \
323 323 TRACE_PTR(%rdi, %rbx, %ebx, %rcx, ttype); \
324 324 TRACE_REGS(%rdi, %rsp, %rbx, %rcx); \
325 325 TRACE_STAMP(%rdi); /* rdtsc clobbers %eax, %edx */ \
326 326 movq REGOFF_RAX(%rsp), %rax; \
327 327 movq REGOFF_RBX(%rsp), %rbx; \
328 328 movq REGOFF_RCX(%rsp), %rcx; \
329 329 movq REGOFF_RDX(%rsp), %rdx; \
330 330 movl %eax, TTR_SYSNUM(%rdi); \
331 331 movq REGOFF_RDI(%rsp), %rdi
332 332
333 333 #define SYSCALL_TRAPTRACE32(ttype) \
334 334 SYSCALL_TRAPTRACE(ttype); \
335 335 /* paranoia: clean the top 32-bits of the registers */ \
336 336 orl %eax, %eax; \
337 337 orl %ebx, %ebx; \
338 338 orl %ecx, %ecx; \
339 339 orl %edx, %edx; \
340 340 orl %edi, %edi
341 341 #else /* TRAPTRACE */
342 342 #define SYSCALL_TRAPTRACE(ttype)
343 343 #define SYSCALL_TRAPTRACE32(ttype)
344 344 #endif /* TRAPTRACE */
345 345
346 346 /*
347 347 * The 64-bit libc syscall wrapper does this:
348 348 *
349 349 * fn(<args>)
350 350 * {
351 351 * movq %rcx, %r10 -- because syscall smashes %rcx
352 352 * movl $CODE, %eax
353 353 * syscall
354 354 * <error processing>
355 355 * }
356 356 *
357 357 * Thus when we come into the kernel:
358 358 *
359 359 * %rdi, %rsi, %rdx, %r10, %r8, %r9 contain first six args
360 360 * %rax is the syscall number
361 361 * %r12-%r15 contain caller state
362 362 *
363 363 * The syscall instruction arranges that:
364 364 *
365 365 * %rcx contains the return %rip
366 366 * %r11d contains bottom 32-bits of %rflags
367 367 * %rflags is masked (as determined by the SFMASK msr)
368 368 * %cs is set to UCS_SEL (as determined by the STAR msr)
369 369 * %ss is set to UDS_SEL (as determined by the STAR msr)
370 370 * %rip is set to sys_syscall (as determined by the LSTAR msr)
371 371 *
372 372 * Or in other words, we have no registers available at all.
373 373 * Only swapgs can save us!
374 374 *
375 375 * Under the hypervisor, the swapgs has happened already. However, the
376 376 * state of the world is very different from that we're familiar with.
377 377 *
378 378 * In particular, we have a stack structure like that for interrupt
379 379 * gates, except that the %cs and %ss registers are modified for reasons
380 380 * that are not entirely clear. Critically, the %rcx/%r11 values do
381 381 * *not* reflect the usage of those registers under a 'real' syscall[1];
382 382 * the stack, therefore, looks like this:
383 383 *
384 384 * 0x0(rsp) potentially junk %rcx
385 385 * 0x8(rsp) potentially junk %r11
386 386 * 0x10(rsp) user %rip
387 387 * 0x18(rsp) modified %cs
388 388 * 0x20(rsp) user %rflags
389 389 * 0x28(rsp) user %rsp
390 390 * 0x30(rsp) modified %ss
391 391 *
392 392 *
393 393 * and before continuing on, we must load the %rip into %rcx and the
394 394 * %rflags into %r11.
395 395 *
396 396 * [1] They used to, and we relied on it, but this was broken in 3.1.1.
397 397 * Sigh.
398 398 */
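To make the XPV_SYSCALL_PROD offsets below concrete, the hypervisor-provided
frame described above can be pictured as this struct overlaying %rsp on entry
(a sketch; the kernel works with raw offsets, not a named type):

	struct xpv_syscall_frame {
		uint64_t junk_rcx;	/* 0x00: not the user's %rip */
		uint64_t junk_r11;	/* 0x08: not the user's %rflags */
		uint64_t user_rip;	/* 0x10: loaded into %rcx */
		uint64_t modified_cs;	/* 0x18 */
		uint64_t user_rflags;	/* 0x20: loaded into %r11 */
		uint64_t user_rsp;	/* 0x28: becomes %rsp again */
		uint64_t modified_ss;	/* 0x30 */
	};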
399 399 #if defined(__xpv)
400 400 #define XPV_SYSCALL_PROD \
401 401 movq 0x10(%rsp), %rcx; \
402 402 movq 0x20(%rsp), %r11; \
403 403 movq 0x28(%rsp), %rsp
404 404 #else
405 405 #define XPV_SYSCALL_PROD /* nothing */
406 406 #endif
407 407
408 408 #if defined(__lint)
409 409
410 410 /*ARGSUSED*/
411 411 void
412 412 sys_syscall()
413 413 {}
414 414
415 415 void
416 416 _allsyscalls()
417 417 {}
418 418
419 419 size_t _allsyscalls_size;
420 420
421 421 #else /* __lint */
422 422
423 423 ENTRY_NP2(brand_sys_syscall,_allsyscalls)
424 424 SWAPGS /* kernel gsbase */
425 425 XPV_SYSCALL_PROD
426 426 BRAND_CALLBACK(BRAND_CB_SYSCALL, BRAND_URET_FROM_REG(%rcx))
427 427 jmp noprod_sys_syscall
428 428
429 429 ALTENTRY(sys_syscall)
430 430 SWAPGS /* kernel gsbase */
431 431 XPV_SYSCALL_PROD
432 432
433 433 noprod_sys_syscall:
434 434 movq %r15, %gs:CPU_RTMP_R15
435 435 movq %rsp, %gs:CPU_RTMP_RSP
436 436
437 437 movq %gs:CPU_THREAD, %r15
438 438 movq T_STACK(%r15), %rsp /* switch from user to kernel stack */
439 439
440 440 ASSERT_UPCALL_MASK_IS_SET
441 441
442 442 movl $UCS_SEL, REGOFF_CS(%rsp)
443 443 movq %rcx, REGOFF_RIP(%rsp) /* syscall: %rip -> %rcx */
444 444 movq %r11, REGOFF_RFL(%rsp) /* syscall: %rfl -> %r11d */
445 445 movl $UDS_SEL, REGOFF_SS(%rsp)
446 446
447 447 movl %eax, %eax /* wrapper: sysc# -> %eax */
448 448 movq %rdi, REGOFF_RDI(%rsp)
449 449 movq %rsi, REGOFF_RSI(%rsp)
450 450 movq %rdx, REGOFF_RDX(%rsp)
451 451 movq %r10, REGOFF_RCX(%rsp) /* wrapper: %rcx -> %r10 */
452 452 movq %r10, %rcx /* arg[3] for direct calls */
453 453
454 454 movq %r8, REGOFF_R8(%rsp)
455 455 movq %r9, REGOFF_R9(%rsp)
456 456 movq %rax, REGOFF_RAX(%rsp)
457 457 movq %rbx, REGOFF_RBX(%rsp)
458 458
459 459 movq %rbp, REGOFF_RBP(%rsp)
460 460 movq %r10, REGOFF_R10(%rsp)
461 461 movq %gs:CPU_RTMP_RSP, %r11
462 462 movq %r11, REGOFF_RSP(%rsp)
463 463 movq %r12, REGOFF_R12(%rsp)
464 464
465 465 movq %r13, REGOFF_R13(%rsp)
466 466 movq %r14, REGOFF_R14(%rsp)
467 467 movq %gs:CPU_RTMP_R15, %r10
468 468 movq %r10, REGOFF_R15(%rsp)
469 469 movq $0, REGOFF_SAVFP(%rsp)
470 470 movq $0, REGOFF_SAVPC(%rsp)
471 471
472 472 /*
473 473 * Copy these registers here in case we end up stopped with
474 474 * someone (like, say, /proc) messing with our register state.
475 475 * We don't -restore- them unless we have to in update_sregs.
476 476 *
477 477 * Since userland -can't- change fsbase or gsbase directly,
478 478 * and capturing them involves two serializing instructions,
479 479 * we don't bother to capture them here.
480 480 */
481 481 xorl %ebx, %ebx
482 482 movw %ds, %bx
483 483 movq %rbx, REGOFF_DS(%rsp)
484 484 movw %es, %bx
485 485 movq %rbx, REGOFF_ES(%rsp)
486 486 movw %fs, %bx
487 487 movq %rbx, REGOFF_FS(%rsp)
488 488 movw %gs, %bx
489 489 movq %rbx, REGOFF_GS(%rsp)
490 490
491 491 /*
492 492 * Machine state saved in the regs structure on the stack
493 493 * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
494 494 * %eax is the syscall number
495 495 * %rsp is the thread's stack, %r15 is curthread
496 496 * REG_RSP(%rsp) is the user's stack
497 497 */
498 498
499 499 SYSCALL_TRAPTRACE($TT_SYSC64)
500 500
501 501 movq %rsp, %rbp
502 502
503 503 movq T_LWP(%r15), %r14
504 504 ASSERT_NO_RUPDATE_PENDING(%r14)
505 505 ENABLE_INTR_FLAGS
506 506
507 507 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
508 508 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */
509 509
510 510 ASSERT_LWPTOREGS(%r14, %rsp)
511 511
512 512 movb $LWP_SYS, LWP_STATE(%r14)
513 513 incq LWP_RU_SYSC(%r14)
514 514 movb $NORMALRETURN, LWP_EOSYS(%r14)
515 515
516 516 incq %gs:CPU_STATS_SYS_SYSCALL
517 517
518 518 movw %ax, T_SYSNUM(%r15)
519 519 movzbl T_PRE_SYS(%r15), %ebx
520 520 ORL_SYSCALLTRACE(%ebx)
521 521 testl %ebx, %ebx
522 522 jne _syscall_pre
523 523
524 524 _syscall_invoke:
525 525 movq REGOFF_RDI(%rbp), %rdi
526 526 movq REGOFF_RSI(%rbp), %rsi
527 527 movq REGOFF_RDX(%rbp), %rdx
528 528 movq REGOFF_RCX(%rbp), %rcx
529 529 movq REGOFF_R8(%rbp), %r8
530 530 movq REGOFF_R9(%rbp), %r9
531 531
532 532 cmpl $NSYSCALL, %eax
533 533 jae _syscall_ill
534 534 shll $SYSENT_SIZE_SHIFT, %eax
535 535 leaq sysent(%rax), %rbx
536 536
537 537 call *SY_CALLC(%rbx)
538 538
539 539 movq %rax, %r12
540 540 movq %rdx, %r13
541 541
542 542 /*
543 543 * If the handler returns two ints, then we need to split the
544 544 * 64-bit return value into two 32-bit values.
545 545 */
546 546 testw $SE_32RVAL2, SY_FLAGS(%rbx)
547 547 je 5f
548 548 movq %r12, %r13
549 549 shrq $32, %r13 /* upper 32-bits into %edx */
550 550 movl %r12d, %r12d /* lower 32-bits into %eax */
551 551 5:
552 552 /*
553 553 * Optimistically assume that there's no post-syscall
554 554 * work to do. (This is to avoid having to call syscall_mstate()
555 555 * with interrupts disabled)
556 556 */
557 557 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
558 558
559 559 /*
560 560 * We must protect ourselves from being descheduled here;
561 561 * If we were, and we ended up on another cpu, or another
562 562 * lwp got in ahead of us, it could change the segment
563 563 * registers without us noticing before we return to userland.
564 564 */
565 565 CLI(%r14)
566 566 CHECK_POSTSYS_NE(%r15, %r14, %ebx)
567 567 jne _syscall_post
568 568 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
569 569
570 570 movq %r12, REGOFF_RAX(%rsp)
571 571 movq %r13, REGOFF_RDX(%rsp)
572 572
573 573 /*
574 574 * To get back to userland, we need the return %rip in %rcx and
575 575 * the return %rfl in %r11d. The sysretq instruction also arranges
576 576 * to fix up %cs and %ss; everything else is our responsibility.
577 577 */
578 578 movq REGOFF_RDI(%rsp), %rdi
579 579 movq REGOFF_RSI(%rsp), %rsi
580 580 movq REGOFF_RDX(%rsp), %rdx
581 581 /* %rcx used to restore %rip value */
582 582
583 583 movq REGOFF_R8(%rsp), %r8
584 584 movq REGOFF_R9(%rsp), %r9
585 585 movq REGOFF_RAX(%rsp), %rax
586 586 movq REGOFF_RBX(%rsp), %rbx
587 587
588 588 movq REGOFF_RBP(%rsp), %rbp
589 589 movq REGOFF_R10(%rsp), %r10
590 590 /* %r11 used to restore %rfl value */
591 591 movq REGOFF_R12(%rsp), %r12
592 592
593 593 movq REGOFF_R13(%rsp), %r13
594 594 movq REGOFF_R14(%rsp), %r14
595 595 movq REGOFF_R15(%rsp), %r15
596 596
597 597 movq REGOFF_RIP(%rsp), %rcx
598 598 movl REGOFF_RFL(%rsp), %r11d
599 599
600 600 #if defined(__xpv)
601 601 addq $REGOFF_RIP, %rsp
602 602 #else
603 603 movq REGOFF_RSP(%rsp), %rsp
604 604 #endif
605 605
606 606 /*
607 607 * There can be no instructions between the ALTENTRY below and
608 608 * SYSRET or we could end up breaking brand support. See label usage
609 609 * in sn1_brand_syscall_callback for an example.
610 610 */
611 611 ASSERT_UPCALL_MASK_IS_SET
612 612 #if defined(__xpv)
613 613 SYSRETQ
614 614 ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
615 615
616 616 /*
617 617 * We can only get here after executing a brand syscall
618 618 * interposition callback handler and simply need to
619 619 * "sysretq" back to userland. On the hypervisor this
620 620 * involves the iret hypercall which requires us to construct
621 621 * just enough of the stack needed for the hypercall.
622 622 * (rip, cs, rflags, rsp, ss).
623 623 */
624 624 movq %rsp, %gs:CPU_RTMP_RSP /* save user's rsp */
625 625 movq %gs:CPU_THREAD, %r11
626 626 movq T_STACK(%r11), %rsp
627 627
628 628 movq %rcx, REGOFF_RIP(%rsp)
629 629 movl $UCS_SEL, REGOFF_CS(%rsp)
630 630 movq %gs:CPU_RTMP_RSP, %r11
631 631 movq %r11, REGOFF_RSP(%rsp)
632 632 pushfq
633 633 popq %r11 /* hypercall enables ints */
634 634 movq %r11, REGOFF_RFL(%rsp)
635 635 movl $UDS_SEL, REGOFF_SS(%rsp)
636 636 addq $REGOFF_RIP, %rsp
637 637 /*
638 638 * XXPV: see comment in SYSRETQ definition for future optimization
639 639 * we could take.
640 640 */
641 641 ASSERT_UPCALL_MASK_IS_SET
642 642 SYSRETQ
643 643 #else
644 644 ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
645 645 SWAPGS /* user gsbase */
646 646 SYSRETQ
647 647 #endif
648 648 /*NOTREACHED*/
649 649 SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
650 650
651 651 _syscall_pre:
652 652 call pre_syscall
653 653 movl %eax, %r12d
654 654 testl %eax, %eax
655 655 jne _syscall_post_call
656 656 /*
657 657 * Didn't abort, so reload the syscall args and invoke the handler.
658 658 */
659 659 movzwl T_SYSNUM(%r15), %eax
660 660 jmp _syscall_invoke
661 661
662 662 _syscall_ill:
663 663 call nosys
664 664 movq %rax, %r12
665 665 movq %rdx, %r13
666 666 jmp _syscall_post_call
667 667
668 668 _syscall_post:
669 669 STI
670 670 /*
671 671 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
672 672 * so that we can account for the extra work it takes us to finish.
673 673 */
674 674 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
675 675 _syscall_post_call:
676 676 movq %r12, %rdi
677 677 movq %r13, %rsi
678 678 call post_syscall
679 679 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
680 680 jmp _sys_rtt
681 681 SET_SIZE(sys_syscall)
682 682 SET_SIZE(brand_sys_syscall)
683 683
684 684 #endif /* __lint */
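The indirect call through SY_CALLC above, together with the SE_32RVAL2 split,
corresponds roughly to this C sketch; sysent, sy_callc and sy_flags are the
usual sysent table fields, while code, a0..a5, rval1 and rval2 are stand-in
names for the syscall number, arguments and return values:

	struct sysent *callp = &sysent[code];
	int64_t rval = (*callp->sy_callc)(a0, a1, a2, a3, a4, a5);

	if (callp->sy_flags & SE_32RVAL2) {
		/* two 32-bit results packed into one 64-bit return value */
		rval1 = (uint32_t)rval;			/* back in %eax */
		rval2 = (uint32_t)(rval >> 32);		/* back in %edx */
	} else {
		rval1 = rval;				/* full 64 bits in %rax */
	}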
685 685
686 686 #if defined(__lint)
687 687
688 688 /*ARGSUSED*/
689 689 void
690 690 sys_syscall32()
691 691 {}
692 692
693 693 #else /* __lint */
694 694
695 695 ENTRY_NP(brand_sys_syscall32)
696 696 SWAPGS /* kernel gsbase */
697 697 XPV_TRAP_POP
698 698 BRAND_CALLBACK(BRAND_CB_SYSCALL32, BRAND_URET_FROM_REG(%rcx))
699 699 jmp nopop_sys_syscall32
700 700
701 701 ALTENTRY(sys_syscall32)
702 702 SWAPGS /* kernel gsbase */
703 703 XPV_TRAP_POP
704 704
705 705 nopop_sys_syscall32:
706 706 movl %esp, %r10d
707 707 movq %gs:CPU_THREAD, %r15
708 708 movq T_STACK(%r15), %rsp
709 709 movl %eax, %eax
710 710
711 711 movl $U32CS_SEL, REGOFF_CS(%rsp)
712 712 movl %ecx, REGOFF_RIP(%rsp) /* syscall: %rip -> %rcx */
713 713 movq %r11, REGOFF_RFL(%rsp) /* syscall: %rfl -> %r11d */
714 714 movq %r10, REGOFF_RSP(%rsp)
715 715 movl $UDS_SEL, REGOFF_SS(%rsp)
716 716
717 717 _syscall32_save:
718 718 movl %edi, REGOFF_RDI(%rsp)
719 719 movl %esi, REGOFF_RSI(%rsp)
720 720 movl %ebp, REGOFF_RBP(%rsp)
721 721 movl %ebx, REGOFF_RBX(%rsp)
722 722 movl %edx, REGOFF_RDX(%rsp)
723 723 movl %ecx, REGOFF_RCX(%rsp)
724 724 movl %eax, REGOFF_RAX(%rsp) /* wrapper: sysc# -> %eax */
725 725 movq $0, REGOFF_SAVFP(%rsp)
726 726 movq $0, REGOFF_SAVPC(%rsp)
727 727
728 728 /*
729 729 * Copy these registers here in case we end up stopped with
730 730 * someone (like, say, /proc) messing with our register state.
731 731 * We don't -restore- them unless we have to in update_sregs.
732 732 *
733 733 * Since userland -can't- change fsbase or gsbase directly,
734 734 * we don't bother to capture them here.
735 735 */
736 736 xorl %ebx, %ebx
737 737 movw %ds, %bx
738 738 movq %rbx, REGOFF_DS(%rsp)
739 739 movw %es, %bx
740 740 movq %rbx, REGOFF_ES(%rsp)
741 741 movw %fs, %bx
742 742 movq %rbx, REGOFF_FS(%rsp)
743 743 movw %gs, %bx
744 744 movq %rbx, REGOFF_GS(%rsp)
745 745
746 746 /*
747 747 * Application state saved in the regs structure on the stack
748 748 * %eax is the syscall number
749 749 * %rsp is the thread's stack, %r15 is curthread
750 750 * REG_RSP(%rsp) is the user's stack
751 751 */
752 752
753 753 SYSCALL_TRAPTRACE32($TT_SYSC)
754 754
755 755 movq %rsp, %rbp
756 756
757 757 movq T_LWP(%r15), %r14
758 758 ASSERT_NO_RUPDATE_PENDING(%r14)
759 759
760 760 ENABLE_INTR_FLAGS
761 761
762 762 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
763 763 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate call) */
764 764
765 765 ASSERT_LWPTOREGS(%r14, %rsp)
766 766
767 767 incq %gs:CPU_STATS_SYS_SYSCALL
768 768
769 769 /*
770 770 * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
771 771 * into 64-bit (long) arg slots, maintaining 16 byte alignment. Or
772 772 * more succinctly:
773 773 *
774 774 * SA(MAXSYSARGS * sizeof (long)) == 64
775 775 */
776 776 #define SYS_DROP 64 /* drop for args */
777 777 subq $SYS_DROP, %rsp
778 778 movb $LWP_SYS, LWP_STATE(%r14)
779 779 movq %r15, %rdi
780 780 movq %rsp, %rsi
781 781 call syscall_entry
782 782
783 783 /*
784 784 * Fetch the arguments copied onto the kernel stack and put
785 785 * them in the right registers to invoke a C-style syscall handler.
786 786 * %rax contains the handler address.
787 787 *
788 788 * Ideas for making all this go faster of course include simply
789 789 * forcibly fetching 6 arguments from the user stack under lofault
790 790 * protection, reverting to copyin_args only when watchpoints
791 791 * are in effect.
792 792 *
793 793 * (If we do this, make sure that exec and libthread leave
794 794 * enough space at the top of the stack to ensure that we'll
795 795 * never do a fetch from an invalid page.)
796 796 *
797 797 * Lots of ideas here, but they won't really help with bringup B-)
798 798 * Correctness can't wait, performance can wait a little longer ..
799 799 */
800 800
801 801 movq %rax, %rbx
802 802 movl 0(%rsp), %edi
803 803 movl 8(%rsp), %esi
804 804 movl 0x10(%rsp), %edx
805 805 movl 0x18(%rsp), %ecx
806 806 movl 0x20(%rsp), %r8d
807 807 movl 0x28(%rsp), %r9d
808 808
809 809 call *SY_CALLC(%rbx)
810 810
811 811 movq %rbp, %rsp /* pop the args */
812 812
813 813 /*
814 814 * amd64 syscall handlers -always- return a 64-bit value in %rax.
815 815 * On the 32-bit kernel, they always return that value in %eax:%edx
816 816 * as required by the 32-bit ABI.
817 817 *
818 818 * Simulate the same behaviour by unconditionally splitting the
819 819 * return value in the same way.
820 820 */
821 821 movq %rax, %r13
822 822 shrq $32, %r13 /* upper 32-bits into %edx */
823 823 movl %eax, %r12d /* lower 32-bits into %eax */
824 824
825 825 /*
826 826 * Optimistically assume that there's no post-syscall
827 827 * work to do. (This is to avoid having to call syscall_mstate()
828 828 * with interrupts disabled)
829 829 */
830 830 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
831 831
832 832 /*
833 833 * We must protect ourselves from being descheduled here;
834 834 * If we were, and we ended up on another cpu, or another
835 835 * lwp got in ahead of us, it could change the segment
836 836 * registers without us noticing before we return to userland.
837 837 */
838 838 CLI(%r14)
839 839 CHECK_POSTSYS_NE(%r15, %r14, %ebx)
840 840 jne _full_syscall_postsys32
841 841 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
842 842
843 843 /*
844 844 * To get back to userland, we need to put the return %rip in %rcx and
845 845 * the return %rfl in %r11d. The sysret instruction also arranges
846 846 * to fix up %cs and %ss; everything else is our responsibility.
847 847 */
848 848
849 849 movl %r12d, %eax /* %eax: rval1 */
850 850 movl REGOFF_RBX(%rsp), %ebx
851 851 /* %ecx used for return pointer */
852 852 movl %r13d, %edx /* %edx: rval2 */
853 853 movl REGOFF_RBP(%rsp), %ebp
854 854 movl REGOFF_RSI(%rsp), %esi
855 855 movl REGOFF_RDI(%rsp), %edi
856 856
857 857 movl REGOFF_RFL(%rsp), %r11d /* %r11 -> eflags */
858 858 movl REGOFF_RIP(%rsp), %ecx /* %ecx -> %eip */
859 859 movl REGOFF_RSP(%rsp), %esp
860 860
861 861 ASSERT_UPCALL_MASK_IS_SET
862 862 ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
863 863 SWAPGS /* user gsbase */
864 864 SYSRETL
865 865 SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
866 866 /*NOTREACHED*/
867 867
868 868 _full_syscall_postsys32:
869 869 STI
870 870 /*
871 871 * Sigh, our optimism wasn't justified, put it back to LMS_SYSTEM
872 872 * so that we can account for the extra work it takes us to finish.
873 873 */
874 874 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
875 875 movq %r15, %rdi
876 876 movq %r12, %rsi /* rval1 - %eax */
877 877 movq %r13, %rdx /* rval2 - %edx */
878 878 call syscall_exit
879 879 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
880 880 jmp _sys_rtt
881 881 SET_SIZE(sys_syscall32)
882 882 SET_SIZE(brand_sys_syscall32)
883 883
884 884 #endif /* __lint */
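The SYS_DROP scratch area and the register loads that follow syscall_entry
amount to widening each 32-bit user argument into its own 64-bit slot before
the C handler is called; roughly, with args as a stand-in for the copied-in
argument block:

	long args[MAXSYSARGS];		/* 8 slots * 8 bytes = 64 = SYS_DROP */

	/*
	 * syscall_entry() copies the 32-bit arguments from the user stack
	 * into these long-sized slots; the first six are then passed,
	 * zero-extended, in the normal C argument registers.
	 */
	rval = (*callp->sy_callc)((uint32_t)args[0], (uint32_t)args[1],
	    (uint32_t)args[2], (uint32_t)args[3], (uint32_t)args[4],
	    (uint32_t)args[5]);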
885 885
886 886 /*
887 887 * System call handler via the sysenter instruction
888 888 * Used only for 32-bit system calls on the 64-bit kernel.
889 889 *
890 890 * The caller in userland has arranged that:
891 891 *
892 892 * - %eax contains the syscall number
893 893 * - %ecx contains the user %esp
894 894 * - %edx contains the return %eip
895 895 * - the user stack contains the args to the syscall
896 896 *
897 897 * Hardware and (privileged) initialization code have arranged that by
 898  898  * the time the sysenter instruction completes:
899 899 *
900 900 * - %rip is pointing to sys_sysenter (below).
901 901 * - %cs and %ss are set to kernel text and stack (data) selectors.
902 902 * - %rsp is pointing at the lwp's stack
903 903 * - interrupts have been disabled.
904 904 *
905 905 * Note that we are unable to return both "rvals" to userland with
906 906 * this call, as %edx is used by the sysexit instruction.
907 907 *
908 908 * One final complication in this routine is its interaction with
909 909 * single-stepping in a debugger. For most of the system call mechanisms,
910 910 * the CPU automatically clears the single-step flag before we enter the
911 911 * kernel. The sysenter mechanism does not clear the flag, so a user
912 912 * single-stepping through a libc routine may suddenly find him/herself
913 913 * single-stepping through the kernel. To detect this, kmdb compares the
 914  914  * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
915 915 * If it finds that we have single-stepped to a sysenter entry point, it
916 916 * explicitly clears the flag and executes the sys_sysenter routine.
917 917 *
918 918 * One final complication in this final complication is the fact that we
919 919 * have two different entry points for sysenter: brand_sys_sysenter and
920 920 * sys_sysenter. If we enter at brand_sys_sysenter and start single-stepping
921 921 * through the kernel with kmdb, we will eventually hit the instruction at
922 922 * sys_sysenter. kmdb cannot distinguish between that valid single-step
923 923 * and the undesirable one mentioned above. To avoid this situation, we
924 924 * simply add a jump over the instruction at sys_sysenter to make it
925 925 * impossible to single-step to it.
926 926 */
927 927 #if defined(__lint)
928 928
929 929 void
930 930 sys_sysenter()
931 931 {}
932 932
933 933 #else /* __lint */
934 934
935 935 ENTRY_NP(brand_sys_sysenter)
936 936 SWAPGS /* kernel gsbase */
937 937 ALTENTRY(_brand_sys_sysenter_post_swapgs)
938 938 BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
939 939 /*
940 940 * Jump over sys_sysenter to allow single-stepping as described
941 941 * above.
942 942 */
943 943 jmp _sys_sysenter_post_swapgs
944 944
945 945 ALTENTRY(sys_sysenter)
946 946 SWAPGS /* kernel gsbase */
947 947
948 948 ALTENTRY(_sys_sysenter_post_swapgs)
949 949 movq %gs:CPU_THREAD, %r15
950 950
951 951 movl $U32CS_SEL, REGOFF_CS(%rsp)
952 952 movl %ecx, REGOFF_RSP(%rsp) /* wrapper: %esp -> %ecx */
953 953 movl %edx, REGOFF_RIP(%rsp) /* wrapper: %eip -> %edx */
954 954 pushfq
955 955 popq %r10
956 956 movl $UDS_SEL, REGOFF_SS(%rsp)
957 957
958 958 /*
959 959 * Set the interrupt flag before storing the flags to the
960 960 * flags image on the stack so we can return to user with
961 961 * interrupts enabled if we return via sys_rtt_syscall32
962 962 */
963 963 orq $PS_IE, %r10
964 964 movq %r10, REGOFF_RFL(%rsp)
965 965
966 966 movl %edi, REGOFF_RDI(%rsp)
967 967 movl %esi, REGOFF_RSI(%rsp)
968 968 movl %ebp, REGOFF_RBP(%rsp)
969 969 movl %ebx, REGOFF_RBX(%rsp)
970 970 movl %edx, REGOFF_RDX(%rsp)
971 971 movl %ecx, REGOFF_RCX(%rsp)
972 972 movl %eax, REGOFF_RAX(%rsp) /* wrapper: sysc# -> %eax */
973 973 movq $0, REGOFF_SAVFP(%rsp)
974 974 movq $0, REGOFF_SAVPC(%rsp)
975 975
976 976 /*
977 977 * Copy these registers here in case we end up stopped with
978 978 * someone (like, say, /proc) messing with our register state.
979 979 * We don't -restore- them unless we have to in update_sregs.
980 980 *
981 981 * Since userland -can't- change fsbase or gsbase directly,
982 982 * we don't bother to capture them here.
983 983 */
984 984 xorl %ebx, %ebx
985 985 movw %ds, %bx
986 986 movq %rbx, REGOFF_DS(%rsp)
987 987 movw %es, %bx
988 988 movq %rbx, REGOFF_ES(%rsp)
989 989 movw %fs, %bx
990 990 movq %rbx, REGOFF_FS(%rsp)
991 991 movw %gs, %bx
992 992 movq %rbx, REGOFF_GS(%rsp)
993 993
994 994 /*
995 995 * Application state saved in the regs structure on the stack
996 996 * %eax is the syscall number
997 997 * %rsp is the thread's stack, %r15 is curthread
998 998 * REG_RSP(%rsp) is the user's stack
999 999 */
1000 1000
1001 1001 SYSCALL_TRAPTRACE($TT_SYSENTER)
1002 1002
1003 1003 movq %rsp, %rbp
1004 1004
1005 1005 movq T_LWP(%r15), %r14
1006 1006 ASSERT_NO_RUPDATE_PENDING(%r14)
1007 1007
1008 1008 ENABLE_INTR_FLAGS
1009 1009
1010 1010 /*
1011 1011 * Catch 64-bit process trying to issue sysenter instruction
1012 1012 * on Nocona based systems.
1013 1013 */
1014 1014 movq LWP_PROCP(%r14), %rax
1015 1015 cmpq $DATAMODEL_ILP32, P_MODEL(%rax)
1016 1016 je 7f
1017 1017
1018 1018 /*
1019 1019 * For a non-32-bit process, simulate a #ud, since that's what
1020 1020 * native hardware does. The traptrace entry (above) will
1021 1021 * let you know what really happened.
1022 1022 */
1023 1023 movq $T_ILLINST, REGOFF_TRAPNO(%rsp)
1024 1024 movq REGOFF_CS(%rsp), %rdi
1025 1025 movq %rdi, REGOFF_ERR(%rsp)
1026 1026 movq %rsp, %rdi
1027 1027 movq REGOFF_RIP(%rsp), %rsi
1028 1028 movl %gs:CPU_ID, %edx
1029 1029 call trap
1030 1030 jmp _sys_rtt
1031 1031 7:
1032 1032
1033 1033 MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
1034 1034 movl REGOFF_RAX(%rsp), %eax /* (%rax damaged by mstate calls) */
1035 1035
1036 1036 ASSERT_LWPTOREGS(%r14, %rsp)
1037 1037
1038 1038 incq %gs:CPU_STATS_SYS_SYSCALL
1039 1039
1040 1040 /*
1041 1041 * Make some space for MAXSYSARGS (currently 8) 32-bit args
1042 1042 * placed into 64-bit (long) arg slots, plus one 64-bit
1043 1043 * (long) arg count, maintaining 16 byte alignment.
1044 1044 */
1045 1045 subq $SYS_DROP, %rsp
1046 1046 movb $LWP_SYS, LWP_STATE(%r14)
1047 1047 movq %r15, %rdi
1048 1048 movq %rsp, %rsi
1049 1049 call syscall_entry
1050 1050
1051 1051 /*
1052 1052 * Fetch the arguments copied onto the kernel stack and put
1053 1053 * them in the right registers to invoke a C-style syscall handler.
1054 1054 * %rax contains the handler address.
1055 1055 */
1056 1056 movq %rax, %rbx
1057 1057 movl 0(%rsp), %edi
1058 1058 movl 8(%rsp), %esi
1059 1059 movl 0x10(%rsp), %edx
1060 1060 movl 0x18(%rsp), %ecx
1061 1061 movl 0x20(%rsp), %r8d
1062 1062 movl 0x28(%rsp), %r9d
1063 1063
1064 1064 call *SY_CALLC(%rbx)
1065 1065
1066 1066 movq %rbp, %rsp /* pop the args */
1067 1067
1068 1068 /*
1069 1069 * amd64 syscall handlers -always- return a 64-bit value in %rax.
1070 1070  * On the 32-bit kernel, they always return that value in %eax:%edx
1071 1071 * as required by the 32-bit ABI.
1072 1072 *
1073 1073 * Simulate the same behaviour by unconditionally splitting the
1074 1074 * return value in the same way.
1075 1075 */
1076 1076 movq %rax, %r13
1077 1077 shrq $32, %r13 /* upper 32-bits into %edx */
1078 1078 movl %eax, %r12d /* lower 32-bits into %eax */
1079 1079
1080 1080 /*
1081 1081 * Optimistically assume that there's no post-syscall
1082 1082 * work to do. (This is to avoid having to call syscall_mstate()
1083 1083 * with interrupts disabled)
1084 1084 */
1085 1085 MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
1086 1086
1087 1087 /*
1088 1088 * We must protect ourselves from being descheduled here;
1089 1089 * If we were, and we ended up on another cpu, or another
1090 1090  * lwp got in ahead of us, it could change the segment
1091 1091 * registers without us noticing before we return to userland.
1092 1092 */
1093 1093 cli
1094 1094 CHECK_POSTSYS_NE(%r15, %r14, %ebx)
1095 1095 jne _full_syscall_postsys32
1096 1096 SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)
1097 1097
1098 1098 /*
1099 1099 * To get back to userland, load up the 32-bit registers and
1100 1100 * sysexit back where we came from.
1101 1101 */
1102 1102
1103 1103 /*
1104 1104 * Interrupts will be turned on by the 'sti' executed just before
1105 1105 * sysexit. The following ensures that restoring the user's rflags
1106 1106 * doesn't enable interrupts too soon.
1107 1107 */
1108 1108 andq $_BITNOT(PS_IE), REGOFF_RFL(%rsp)
1109 1109
1110 1110 /*
1111 1111 * (There's no point in loading up %edx because the sysexit
1112 1112 * mechanism smashes it.)
1113 1113 */
1114 1114 movl %r12d, %eax
1115 1115 movl REGOFF_RBX(%rsp), %ebx
1116 1116 movl REGOFF_RBP(%rsp), %ebp
1117 1117 movl REGOFF_RSI(%rsp), %esi
1118 1118 movl REGOFF_RDI(%rsp), %edi
1119 1119
1120 1120 movl REGOFF_RIP(%rsp), %edx /* sysexit: %edx -> %eip */
1121 1121 pushq REGOFF_RFL(%rsp)
1122 1122 popfq
1123 1123 movl REGOFF_RSP(%rsp), %ecx /* sysexit: %ecx -> %esp */
1124 1124 ALTENTRY(sys_sysenter_swapgs_sysexit)
1125 1125 swapgs
1126 1126 sti
1127 1127 sysexit
1128 1128 SET_SIZE(sys_sysenter_swapgs_sysexit)
1129 1129 SET_SIZE(sys_sysenter)
1130 1130 SET_SIZE(_sys_sysenter_post_swapgs)
1131 1131 SET_SIZE(brand_sys_sysenter)
1132 1132
1133 1133 #endif /* __lint */
1134 1134
1135 +#if defined(__lint)
1135 1136 /*
1137 + * System call via an int80. This entry point is only used by the Linux
1138 + * application environment. Unlike the other entry points, there is no
1139 + * default action to take if no callback is registered for this process.
1140 + */
1141 +void
1142 +sys_int80()
1143 +{}
1144 +
1145 +#else /* __lint */
1146 +
1147 + ENTRY_NP(brand_sys_int80)
1148 + SWAPGS /* kernel gsbase */
1149 + XPV_TRAP_POP
1150 + BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
1151 + SWAPGS /* user gsbase */
1152 + jmp nopop_int80
1153 +
1154 + ENTRY_NP(sys_int80)
1155 + /*
1156 + * We hit an int80, but this process isn't of a brand with an int80
1157 + * handler. Bad process! Make it look as if the INT failed.
1158 + * Modify %rip to point before the INT, push the expected error
1159 + * code and fake a GP fault. Note on 64-bit hypervisor we need
1160 + * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
1161 + * because gptrap will pop them again with its own XPV_TRAP_POP.
1162 + */
1163 + XPV_TRAP_POP
1164 +nopop_int80:
1165 + subq $2, (%rsp) /* int insn 2-bytes */
1166 + pushq $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
1167 +#if defined(__xpv)
1168 + push %r11
1169 + push %rcx
1170 +#endif
1171 + jmp gptrap / GP fault
1172 + SET_SIZE(sys_int80)
1173 + SET_SIZE(brand_sys_int80)
1174 +#endif /* __lint */
1175 +
1176 +
1177 +/*
1136 1178 * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
1137 1179 * the generic i386 libc to do system calls. We do a small amount of setup
1138 1180 * before jumping into the existing sys_syscall32 path.
1139 1181 */
1140 1182 #if defined(__lint)
1141 1183
1142 1184 /*ARGSUSED*/
1143 1185 void
1144 1186 sys_syscall_int()
1145 1187 {}
1146 1188
1147 1189 #else /* __lint */
1148 1190
1149 1191 ENTRY_NP(brand_sys_syscall_int)
1150 1192 SWAPGS /* kernel gsbase */
1151 1193 XPV_TRAP_POP
1152 1194 BRAND_CALLBACK(BRAND_CB_INT91, BRAND_URET_FROM_INTR_STACK())
1153 1195 jmp nopop_syscall_int
1154 1196
1155 1197 ALTENTRY(sys_syscall_int)
1156 1198 SWAPGS /* kernel gsbase */
1157 1199 XPV_TRAP_POP
1158 1200
1159 1201 nopop_syscall_int:
1160 1202 movq %gs:CPU_THREAD, %r15
1161 1203 movq T_STACK(%r15), %rsp
1162 1204 movl %eax, %eax
1163 1205 /*
1164 1206 * Set t_post_sys on this thread to force ourselves out via the slow
1165 1207 * path. It might be possible at some later date to optimize this out
1166 1208 * and use a faster return mechanism.
1167 1209 */
1168 1210 movb $1, T_POST_SYS(%r15)
1169 1211 CLEAN_CS
1170 1212 jmp _syscall32_save
1171 1213 /*
1172 1214 * There should be no instructions between this label and SWAPGS/IRET
1173 1215 * or we could end up breaking branded zone support. See the usage of
1174 1216 * this label in lx_brand_int80_callback and sn1_brand_int91_callback
1175 1217 * for examples.
1176 1218 */
1177 1219 ALTENTRY(sys_sysint_swapgs_iret)
1178 1220 SWAPGS /* user gsbase */
1179 1221 IRET
1180 1222 /*NOTREACHED*/
1181 1223 SET_SIZE(sys_sysint_swapgs_iret)
1182 1224 SET_SIZE(sys_syscall_int)
1183 1225 SET_SIZE(brand_sys_syscall_int)
1184 1226
1185 1227 #endif /* __lint */
1186 1228
1187 1229 /*
1188 1230 * Legacy 32-bit applications and old libc implementations do lcalls;
1189 1231 * we should never get here because the LDT entry containing the syscall
1190 1232 * segment descriptor has the "segment present" bit cleared, which means
1191 1233 * we end up processing those system calls in trap() via a not-present trap.
1192 1234 *
1193 1235 * We do it this way because a call gate unhelpfully does -nothing- to the
1194 1236 * interrupt flag bit, so an interrupt can run us just after the lcall
1195 1237 * completes, but just before the swapgs takes effect. Thus the INTR_PUSH and
1196 1238 * INTR_POP paths would have to be slightly more complex to dance around
1197 1239 * this problem, and end up depending explicitly on the first
1198 1240 * instruction of this handler being either swapgs or cli.
1199 1241 */
1200 1242
1201 1243 #if defined(__lint)
1202 1244
1203 1245 /*ARGSUSED*/
1204 1246 void
1205 1247 sys_lcall32()
1206 1248 {}
1207 1249
1208 1250 #else /* __lint */
1209 1251
1210 1252 ENTRY_NP(sys_lcall32)
1211 1253 SWAPGS /* kernel gsbase */
1212 1254 pushq $0
1213 1255 pushq %rbp
1214 1256 movq %rsp, %rbp
1215 1257 leaq __lcall_panic_str(%rip), %rdi
1216 1258 xorl %eax, %eax
1217 1259 call panic
1218 1260 SET_SIZE(sys_lcall32)
1219 1261
1220 1262 __lcall_panic_str:
1221 1263 .string "sys_lcall32: shouldn't be here!"
1222 1264
1223 1265 /*
1224 1266 * Declare a uintptr_t which covers the entire pc range of syscall
1225 1267 * handlers for the stack walkers that need this.
1226 1268 */
1227 1269 .align CPTRSIZE
1228 1270 .globl _allsyscalls_size
1229 1271 .type _allsyscalls_size, @object
1230 1272 _allsyscalls_size:
1231 1273 .NWORD . - _allsyscalls
1232 1274 SET_SIZE(_allsyscalls_size)
1233 1275
1234 1276 #endif /* __lint */
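A stack walker that wants to know whether a saved pc lies inside any of these
handlers only needs the two symbols declared above; a hypothetical check:

	extern char _allsyscalls[];
	extern uintptr_t _allsyscalls_size;

	static int
	pc_in_syscall_handlers(uintptr_t pc)
	{
		return (pc >= (uintptr_t)_allsyscalls &&
		    pc < (uintptr_t)_allsyscalls + _allsyscalls_size);
	}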
1235 1277
1236 1278 /*
1237 1279 * These are the thread context handlers for lwps using sysenter/sysexit.
1238 1280 */
1239 1281
1240 1282 #if defined(__lint)
1241 1283
1242 1284 /*ARGSUSED*/
1243 1285 void
1244 1286 sep_save(void *ksp)
1245 1287 {}
1246 1288
1247 1289 /*ARGSUSED*/
1248 1290 void
1249 1291 sep_restore(void *ksp)
1250 1292 {}
1251 1293
1252 1294 #else /* __lint */
1253 1295
1254 1296 /*
1255 1297 * setting this value to zero as we switch away causes the
1256 1298 * stack-pointer-on-sysenter to be NULL, ensuring that we
1257 1299 * don't silently corrupt another (preempted) thread stack
1258 1300 * when running an lwp that (somehow) didn't get sep_restore'd
1259 1301 */
1260 1302 ENTRY_NP(sep_save)
1261 1303 xorl %edx, %edx
1262 1304 xorl %eax, %eax
1263 1305 movl $MSR_INTC_SEP_ESP, %ecx
1264 1306 wrmsr
1265 1307 ret
1266 1308 SET_SIZE(sep_save)
1267 1309
1268 1310 /*
1269 1311 * Update the kernel stack pointer as we resume onto this cpu.
1270 1312 */
1271 1313 ENTRY_NP(sep_restore)
1272 1314 movq %rdi, %rdx
1273 1315 shrq $32, %rdx
1274 1316 movl %edi, %eax
1275 1317 movl $MSR_INTC_SEP_ESP, %ecx
1276 1318 wrmsr
1277 1319 ret
1278 1320 SET_SIZE(sep_restore)
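sep_restore just reprograms the sysenter kernel-stack-pointer MSR with the
incoming value, split into the %edx:%eax halves that wrmsr expects; in C, with
wrmsr32 as a hypothetical (msr, hi, lo) helper:

	void
	sep_restore(void *ksp)
	{
		uint64_t v = (uintptr_t)ksp;

		wrmsr32(MSR_INTC_SEP_ESP,
		    (uint32_t)(v >> 32),	/* high half -> %edx */
		    (uint32_t)v);		/* low half  -> %eax */
	}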
1279 1321
1280 1322 #endif /* __lint */