author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700
---|---|---
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d | /arch/ia64/kernel
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'arch/ia64/kernel')
58 files changed, 35362 insertions, 0 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
new file mode 100644
index 000000000000..c1a02bbc252c
--- /dev/null
+++ b/arch/ia64/kernel/Makefile
@@ -0,0 +1,52 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head.o init_task.o vmlinux.lds
+
+obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
+	irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \
+	salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
+	unwind.o mca.o mca_asm.o topology.o
+
+obj-$(CONFIG_IA64_BRL_EMU)	+= brl_emu.o
+obj-$(CONFIG_IA64_GENERIC)	+= acpi-ext.o
+obj-$(CONFIG_IA64_HP_ZX1)	+= acpi-ext.o
+obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
+obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
+obj-$(CONFIG_IOSAPIC)		+= iosapic.o
+obj-$(CONFIG_MODULES)		+= module.o
+obj-$(CONFIG_SMP)		+= smp.o smpboot.o domain.o
+obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
+obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
+obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
+mca_recovery-y			+= mca_drv.o mca_drv_asm.o
+
+# The gate DSO image is built using a special linker script.
+targets += gate.so gate-syms.o
+
+extra-y += gate.so gate-syms.o gate.lds gate.o
+
+# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
+CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
+
+CPPFLAGS_gate.lds := -P -C -U$(ARCH)
+
+quiet_cmd_gate = GATE $@
+      cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1
+$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+$(obj)/built-in.o: $(obj)/gate-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
+
+GATECFLAGS_gate-syms.o = -r
+$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
+	$(call if_changed,gate)
+
+# gate-data.o contains the gate DSO image as data in section .data.gate.
+# We must build gate.so before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c
new file mode 100644
index 000000000000..2623df5e2633
--- /dev/null
+++ b/arch/ia64/kernel/acpi-ext.c
@@ -0,0 +1,100 @@
+/*
+ * arch/ia64/kernel/acpi-ext.c
+ *
+ * Copyright (C) 2003 Hewlett-Packard
+ * Copyright (C) Alex Williamson
+ * Copyright (C) Bjorn Helgaas
+ *
+ * Vendor specific extensions to ACPI.
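+ *
+ * (Background, inferred from the code below: HP firmware describes a
+ * device's CSR space with a vendor-defined resource in its _CRS method.
+ * The resource payload is a GUID-tagged struct acpi_vendor_descriptor
+ * followed by the data itself; for the CCSR descriptor, guid_id 2, the
+ * data is an 8-byte base address followed by an 8-byte length.)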
+ */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <linux/efi.h> + +#include <asm/acpi-ext.h> + +struct acpi_vendor_descriptor { + u8 guid_id; + efi_guid_t guid; +}; + +struct acpi_vendor_info { + struct acpi_vendor_descriptor *descriptor; + u8 *data; + u32 length; +}; + +acpi_status +acpi_vendor_resource_match(struct acpi_resource *resource, void *context) +{ + struct acpi_vendor_info *info = (struct acpi_vendor_info *) context; + struct acpi_resource_vendor *vendor; + struct acpi_vendor_descriptor *descriptor; + u32 length; + + if (resource->id != ACPI_RSTYPE_VENDOR) + return AE_OK; + + vendor = (struct acpi_resource_vendor *) &resource->data; + descriptor = (struct acpi_vendor_descriptor *) vendor->reserved; + if (vendor->length <= sizeof(*info->descriptor) || + descriptor->guid_id != info->descriptor->guid_id || + efi_guidcmp(descriptor->guid, info->descriptor->guid)) + return AE_OK; + + length = vendor->length - sizeof(struct acpi_vendor_descriptor); + info->data = acpi_os_allocate(length); + if (!info->data) + return AE_NO_MEMORY; + + memcpy(info->data, vendor->reserved + sizeof(struct acpi_vendor_descriptor), length); + info->length = length; + return AE_CTRL_TERMINATE; +} + +acpi_status +acpi_find_vendor_resource(acpi_handle obj, struct acpi_vendor_descriptor *id, + u8 **data, u32 *length) +{ + struct acpi_vendor_info info; + + info.descriptor = id; + info.data = NULL; + + acpi_walk_resources(obj, METHOD_NAME__CRS, acpi_vendor_resource_match, &info); + if (!info.data) + return AE_NOT_FOUND; + + *data = info.data; + *length = info.length; + return AE_OK; +} + +struct acpi_vendor_descriptor hp_ccsr_descriptor = { + .guid_id = 2, + .guid = EFI_GUID(0x69e9adf9, 0x924f, 0xab5f, 0xf6, 0x4a, 0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad) +}; + +acpi_status +hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length) +{ + acpi_status status; + u8 *data; + u32 length; + + status = acpi_find_vendor_resource(obj, &hp_ccsr_descriptor, &data, &length); + + if (ACPI_FAILURE(status) || length != 16) + return AE_NOT_FOUND; + + memcpy(csr_base, data, sizeof(*csr_base)); + memcpy(csr_length, data + 8, sizeof(*csr_length)); + acpi_os_free(data); + + return AE_OK; +} + +EXPORT_SYMBOL(hp_acpi_csr_space); diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c new file mode 100644 index 000000000000..a8e99c56a768 --- /dev/null +++ b/arch/ia64/kernel/acpi.c @@ -0,0 +1,841 @@ +/* + * acpi.c - Architecture-Specific Low-Level ACPI Support + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> + * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2000 Intel Corp. + * Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com> + * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com> + * Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com> + * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/irq.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/mmzone.h> +#include <linux/nodemask.h> +#include <asm/io.h> +#include <asm/iosapic.h> +#include <asm/machvec.h> +#include <asm/page.h> +#include <asm/system.h> +#include <asm/numa.h> +#include <asm/sal.h> +#include <asm/cyclone.h> + +#define BAD_MADT_ENTRY(entry, end) ( \ + (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ + ((acpi_table_entry_header *)entry)->length != sizeof(*entry)) + +#define PREFIX "ACPI: " + +void (*pm_idle) (void); +EXPORT_SYMBOL(pm_idle); +void (*pm_power_off) (void); +EXPORT_SYMBOL(pm_power_off); + +unsigned char acpi_kbd_controller_present = 1; +unsigned char acpi_legacy_devices; + +#define MAX_SAPICS 256 +u16 ia64_acpiid_to_sapicid[MAX_SAPICS] = + { [0 ... MAX_SAPICS - 1] = -1 }; +EXPORT_SYMBOL(ia64_acpiid_to_sapicid); + +const char * +acpi_get_sysname (void) +{ +#ifdef CONFIG_IA64_GENERIC + unsigned long rsdp_phys; + struct acpi20_table_rsdp *rsdp; + struct acpi_table_xsdt *xsdt; + struct acpi_table_header *hdr; + + rsdp_phys = acpi_find_rsdp(); + if (!rsdp_phys) { + printk(KERN_ERR "ACPI 2.0 RSDP not found, default to \"dig\"\n"); + return "dig"; + } + + rsdp = (struct acpi20_table_rsdp *) __va(rsdp_phys); + if (strncmp(rsdp->signature, RSDP_SIG, sizeof(RSDP_SIG) - 1)) { + printk(KERN_ERR "ACPI 2.0 RSDP signature incorrect, default to \"dig\"\n"); + return "dig"; + } + + xsdt = (struct acpi_table_xsdt *) __va(rsdp->xsdt_address); + hdr = &xsdt->header; + if (strncmp(hdr->signature, XSDT_SIG, sizeof(XSDT_SIG) - 1)) { + printk(KERN_ERR "ACPI 2.0 XSDT signature incorrect, default to \"dig\"\n"); + return "dig"; + } + + if (!strcmp(hdr->oem_id, "HP")) { + return "hpzx1"; + } + else if (!strcmp(hdr->oem_id, "SGI")) { + return "sn2"; + } + + return "dig"; +#else +# if defined (CONFIG_IA64_HP_SIM) + return "hpsim"; +# elif defined (CONFIG_IA64_HP_ZX1) + return "hpzx1"; +# elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB) + return "hpzx1_swiotlb"; +# elif defined (CONFIG_IA64_SGI_SN2) + return "sn2"; +# elif defined (CONFIG_IA64_DIG) + return "dig"; +# else +# error Unknown platform. Fix acpi.c. +# endif +#endif +} + +#ifdef CONFIG_ACPI_BOOT + +#define ACPI_MAX_PLATFORM_INTERRUPTS 256 + +/* Array to record platform interrupt vectors for generic interrupt routing. */ +int platform_intr_list[ACPI_MAX_PLATFORM_INTERRUPTS] = { + [0 ... ACPI_MAX_PLATFORM_INTERRUPTS - 1] = -1 +}; + +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_IOSAPIC; + +/* + * Interrupt routing API for device drivers. Provides interrupt vector for + * a generic platform event. Currently only CPEI is implemented. 
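+ *
+ * (CPEI here is the Corrected Platform Error Interrupt; its vector is
+ * recorded in platform_intr_list[] when the MADT platform interrupt
+ * source entries are parsed by acpi_parse_plat_int_src() below.)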
+ */ +int +acpi_request_vector (u32 int_type) +{ + int vector = -1; + + if (int_type < ACPI_MAX_PLATFORM_INTERRUPTS) { + /* corrected platform error interrupt */ + vector = platform_intr_list[int_type]; + } else + printk(KERN_ERR "acpi_request_vector(): invalid interrupt type\n"); + return vector; +} + +char * +__acpi_map_table (unsigned long phys_addr, unsigned long size) +{ + return __va(phys_addr); +} + +/* -------------------------------------------------------------------------- + Boot-time Table Parsing + -------------------------------------------------------------------------- */ + +static int total_cpus __initdata; +static int available_cpus __initdata; +struct acpi_table_madt * acpi_madt __initdata; +static u8 has_8259; + + +static int __init +acpi_parse_lapic_addr_ovr ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_lapic_addr_ovr *lapic; + + lapic = (struct acpi_table_lapic_addr_ovr *) header; + + if (BAD_MADT_ENTRY(lapic, end)) + return -EINVAL; + + if (lapic->address) { + iounmap(ipi_base_addr); + ipi_base_addr = ioremap(lapic->address, 0); + } + return 0; +} + + +static int __init +acpi_parse_lsapic (acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_lsapic *lsapic; + + lsapic = (struct acpi_table_lsapic *) header; + + if (BAD_MADT_ENTRY(lsapic, end)) + return -EINVAL; + + if (lsapic->flags.enabled) { +#ifdef CONFIG_SMP + smp_boot_data.cpu_phys_id[available_cpus] = (lsapic->id << 8) | lsapic->eid; +#endif + ia64_acpiid_to_sapicid[lsapic->acpi_id] = (lsapic->id << 8) | lsapic->eid; + ++available_cpus; + } + + total_cpus++; + return 0; +} + + +static int __init +acpi_parse_lapic_nmi (acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_lapic_nmi *lacpi_nmi; + + lacpi_nmi = (struct acpi_table_lapic_nmi*) header; + + if (BAD_MADT_ENTRY(lacpi_nmi, end)) + return -EINVAL; + + /* TBD: Support lapic_nmi entries */ + return 0; +} + + +static int __init +acpi_parse_iosapic (acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_iosapic *iosapic; + + iosapic = (struct acpi_table_iosapic *) header; + + if (BAD_MADT_ENTRY(iosapic, end)) + return -EINVAL; + + iosapic_init(iosapic->address, iosapic->global_irq_base); + + return 0; +} + + +static int __init +acpi_parse_plat_int_src ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_plat_int_src *plintsrc; + int vector; + + plintsrc = (struct acpi_table_plat_int_src *) header; + + if (BAD_MADT_ENTRY(plintsrc, end)) + return -EINVAL; + + /* + * Get vector assignment for this interrupt, set attributes, + * and program the IOSAPIC routing table. + */ + vector = iosapic_register_platform_intr(plintsrc->type, + plintsrc->global_irq, + plintsrc->iosapic_vector, + plintsrc->eid, + plintsrc->id, + (plintsrc->flags.polarity == 1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, + (plintsrc->flags.trigger == 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL); + + platform_intr_list[plintsrc->type] = vector; + return 0; +} + + +static int __init +acpi_parse_int_src_ovr ( + acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_int_src_ovr *p; + + p = (struct acpi_table_int_src_ovr *) header; + + if (BAD_MADT_ENTRY(p, end)) + return -EINVAL; + + iosapic_override_isa_irq(p->bus_irq, p->global_irq, + (p->flags.polarity == 1) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, + (p->flags.trigger == 1) ? 
IOSAPIC_EDGE : IOSAPIC_LEVEL); + return 0; +} + + +static int __init +acpi_parse_nmi_src (acpi_table_entry_header *header, const unsigned long end) +{ + struct acpi_table_nmi_src *nmi_src; + + nmi_src = (struct acpi_table_nmi_src*) header; + + if (BAD_MADT_ENTRY(nmi_src, end)) + return -EINVAL; + + /* TBD: Support nimsrc entries */ + return 0; +} + +static void __init +acpi_madt_oem_check (char *oem_id, char *oem_table_id) +{ + if (!strncmp(oem_id, "IBM", 3) && + (!strncmp(oem_table_id, "SERMOW", 6))) { + + /* + * Unfortunately ITC_DRIFT is not yet part of the + * official SAL spec, so the ITC_DRIFT bit is not + * set by the BIOS on this hardware. + */ + sal_platform_features |= IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT; + + cyclone_setup(); + } +} + +static int __init +acpi_parse_madt (unsigned long phys_addr, unsigned long size) +{ + if (!phys_addr || !size) + return -EINVAL; + + acpi_madt = (struct acpi_table_madt *) __va(phys_addr); + + /* remember the value for reference after free_initmem() */ +#ifdef CONFIG_ITANIUM + has_8259 = 1; /* Firmware on old Itanium systems is broken */ +#else + has_8259 = acpi_madt->flags.pcat_compat; +#endif + iosapic_system_init(has_8259); + + /* Get base address of IPI Message Block */ + + if (acpi_madt->lapic_address) + ipi_base_addr = ioremap(acpi_madt->lapic_address, 0); + + printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr); + + acpi_madt_oem_check(acpi_madt->header.oem_id, + acpi_madt->header.oem_table_id); + + return 0; +} + + +#ifdef CONFIG_ACPI_NUMA + +#undef SLIT_DEBUG + +#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32) + +static int __initdata srat_num_cpus; /* number of cpus */ +static u32 __devinitdata pxm_flag[PXM_FLAG_LEN]; +#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag)) +#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag)) +/* maps to convert between proximity domain and logical node ID */ +int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS]; +int __initdata nid_to_pxm_map[MAX_NUMNODES]; +static struct acpi_table_slit __initdata *slit_table; + +/* + * ACPI 2.0 SLIT (System Locality Information Table) + * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf + */ +void __init +acpi_numa_slit_init (struct acpi_table_slit *slit) +{ + u32 len; + + len = sizeof(struct acpi_table_header) + 8 + + slit->localities * slit->localities; + if (slit->header.length != len) { + printk(KERN_ERR "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n", + len, slit->header.length); + memset(numa_slit, 10, sizeof(numa_slit)); + return; + } + slit_table = slit; +} + +void __init +acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa) +{ + /* record this node in proximity bitmap */ + pxm_bit_set(pa->proximity_domain); + + node_cpuid[srat_num_cpus].phys_id = (pa->apic_id << 8) | (pa->lsapic_eid); + /* nid should be overridden as logical node id later */ + node_cpuid[srat_num_cpus].nid = pa->proximity_domain; + srat_num_cpus++; +} + +void __init +acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma) +{ + unsigned long paddr, size; + u8 pxm; + struct node_memblk_s *p, *q, *pend; + + pxm = ma->proximity_domain; + + /* fill node memory chunk structure */ + paddr = ma->base_addr_hi; + paddr = (paddr << 32) | ma->base_addr_lo; + size = ma->length_hi; + size = (size << 32) | ma->length_lo; + + /* Ignore disabled entries */ + if (!ma->flags.enabled) + return; + + /* record this node in proximity bitmap */ + pxm_bit_set(pxm); + + /* Insertion sort based on base address */ + pend = 
&node_memblk[num_node_memblks]; + for (p = &node_memblk[0]; p < pend; p++) { + if (paddr < p->start_paddr) + break; + } + if (p < pend) { + for (q = pend - 1; q >= p; q--) + *(q + 1) = *q; + } + p->start_paddr = paddr; + p->size = size; + p->nid = pxm; + num_node_memblks++; +} + +void __init +acpi_numa_arch_fixup (void) +{ + int i, j, node_from, node_to; + + /* If there's no SRAT, fix the phys_id and mark node 0 online */ + if (srat_num_cpus == 0) { + node_set_online(0); + node_cpuid[0].phys_id = hard_smp_processor_id(); + return; + } + + /* + * MCD - This can probably be dropped now. No need for pxm ID to node ID + * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES. + */ + /* calculate total number of nodes in system from PXM bitmap */ + memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); + memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); + nodes_clear(node_online_map); + for (i = 0; i < MAX_PXM_DOMAINS; i++) { + if (pxm_bit_test(i)) { + int nid = num_online_nodes(); + pxm_to_nid_map[i] = nid; + nid_to_pxm_map[nid] = i; + node_set_online(nid); + } + } + + /* set logical node id in memory chunk structure */ + for (i = 0; i < num_node_memblks; i++) + node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid]; + + /* assign memory bank numbers for each chunk on each node */ + for_each_online_node(i) { + int bank; + + bank = 0; + for (j = 0; j < num_node_memblks; j++) + if (node_memblk[j].nid == i) + node_memblk[j].bank = bank++; + } + + /* set logical node id in cpu structure */ + for (i = 0; i < srat_num_cpus; i++) + node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid]; + + printk(KERN_INFO "Number of logical nodes in system = %d\n", num_online_nodes()); + printk(KERN_INFO "Number of memory chunks in system = %d\n", num_node_memblks); + + if (!slit_table) return; + memset(numa_slit, -1, sizeof(numa_slit)); + for (i=0; i<slit_table->localities; i++) { + if (!pxm_bit_test(i)) + continue; + node_from = pxm_to_nid_map[i]; + for (j=0; j<slit_table->localities; j++) { + if (!pxm_bit_test(j)) + continue; + node_to = pxm_to_nid_map[j]; + node_distance(node_from, node_to) = + slit_table->entry[i*slit_table->localities + j]; + } + } + +#ifdef SLIT_DEBUG + printk("ACPI 2.0 SLIT locality table:\n"); + for_each_online_node(i) { + for_each_online_node(j) + printk("%03d ", node_distance(i,j)); + printk("\n"); + } +#endif +} +#endif /* CONFIG_ACPI_NUMA */ + +unsigned int +acpi_register_gsi (u32 gsi, int edge_level, int active_high_low) +{ + if (has_8259 && gsi < 16) + return isa_irq_to_vector(gsi); + + return iosapic_register_intr(gsi, + (active_high_low == ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, + (edge_level == ACPI_EDGE_SENSITIVE) ? 
IOSAPIC_EDGE : IOSAPIC_LEVEL); +} +EXPORT_SYMBOL(acpi_register_gsi); + +#ifdef CONFIG_ACPI_DEALLOCATE_IRQ +void +acpi_unregister_gsi (u32 gsi) +{ + iosapic_unregister_intr(gsi); +} +EXPORT_SYMBOL(acpi_unregister_gsi); +#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */ + +static int __init +acpi_parse_fadt (unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_header *fadt_header; + struct fadt_descriptor_rev2 *fadt; + + if (!phys_addr || !size) + return -EINVAL; + + fadt_header = (struct acpi_table_header *) __va(phys_addr); + if (fadt_header->revision != 3) + return -ENODEV; /* Only deal with ACPI 2.0 FADT */ + + fadt = (struct fadt_descriptor_rev2 *) fadt_header; + + if (!(fadt->iapc_boot_arch & BAF_8042_KEYBOARD_CONTROLLER)) + acpi_kbd_controller_present = 0; + + if (fadt->iapc_boot_arch & BAF_LEGACY_DEVICES) + acpi_legacy_devices = 1; + + acpi_register_gsi(fadt->sci_int, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); + return 0; +} + + +unsigned long __init +acpi_find_rsdp (void) +{ + unsigned long rsdp_phys = 0; + + if (efi.acpi20) + rsdp_phys = __pa(efi.acpi20); + else if (efi.acpi) + printk(KERN_WARNING PREFIX "v1.0/r0.71 tables no longer supported\n"); + return rsdp_phys; +} + + +int __init +acpi_boot_init (void) +{ + + /* + * MADT + * ---- + * Parse the Multiple APIC Description Table (MADT), if exists. + * Note that this table provides platform SMP configuration + * information -- the successor to MPS tables. + */ + + if (acpi_table_parse(ACPI_APIC, acpi_parse_madt) < 1) { + printk(KERN_ERR PREFIX "Can't find MADT\n"); + goto skip_madt; + } + + /* Local APIC */ + + if (acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0) < 0) + printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); + + if (acpi_table_parse_madt(ACPI_MADT_LSAPIC, acpi_parse_lsapic, NR_CPUS) < 1) + printk(KERN_ERR PREFIX "Error parsing MADT - no LAPIC entries\n"); + + if (acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0) < 0) + printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + + /* I/O APIC */ + + if (acpi_table_parse_madt(ACPI_MADT_IOSAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1) + printk(KERN_ERR PREFIX "Error parsing MADT - no IOSAPIC entries\n"); + + /* System-Level Interrupt Routing */ + + if (acpi_table_parse_madt(ACPI_MADT_PLAT_INT_SRC, acpi_parse_plat_int_src, ACPI_MAX_PLATFORM_INTERRUPTS) < 0) + printk(KERN_ERR PREFIX "Error parsing platform interrupt source entry\n"); + + if (acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, 0) < 0) + printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); + + if (acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, 0) < 0) + printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + skip_madt: + + /* + * FADT says whether a legacy keyboard controller is present. + * The FADT also contains an SCI_INT line, by which the system + * gets interrupts such as power and sleep buttons. If it's not + * on a Legacy interrupt, it needs to be setup. + */ + if (acpi_table_parse(ACPI_FADT, acpi_parse_fadt) < 1) + printk(KERN_ERR PREFIX "Can't find FADT\n"); + +#ifdef CONFIG_SMP + if (available_cpus == 0) { + printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n"); + printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id()); + smp_boot_data.cpu_phys_id[available_cpus] = hard_smp_processor_id(); + available_cpus = 1; /* We've got at least one of these, no? 
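+ (the MADT listed no enabled LSAPIC entries, but
+ the boot processor is certainly running, so fall
+ back to its hard_smp_processor_id)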
*/ + } + smp_boot_data.cpu_count = available_cpus; + + smp_build_cpu_map(); +# ifdef CONFIG_ACPI_NUMA + if (srat_num_cpus == 0) { + int cpu, i = 1; + for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++) + if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id()) + node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu]; + } + build_cpu_to_node_map(); +# endif +#endif + /* Make boot-up look pretty */ + printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, total_cpus); + return 0; +} + +int +acpi_gsi_to_irq (u32 gsi, unsigned int *irq) +{ + int vector; + + if (has_8259 && gsi < 16) + *irq = isa_irq_to_vector(gsi); + else { + vector = gsi_to_vector(gsi); + if (vector == -1) + return -1; + + *irq = vector; + } + return 0; +} + +/* + * ACPI based hotplug CPU support + */ +#ifdef CONFIG_ACPI_HOTPLUG_CPU +static +int +acpi_map_cpu2node(acpi_handle handle, int cpu, long physid) +{ +#ifdef CONFIG_ACPI_NUMA + int pxm_id; + + pxm_id = acpi_get_pxm(handle); + + /* + * Assuming that the container driver would have set the proximity + * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag + */ + node_cpuid[cpu].nid = (pxm_id < 0) ? 0: + pxm_to_nid_map[pxm_id]; + + node_cpuid[cpu].phys_id = physid; +#endif + return(0); +} + + +int +acpi_map_lsapic(acpi_handle handle, int *pcpu) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *obj; + struct acpi_table_lsapic *lsapic; + cpumask_t tmp_map; + long physid; + int cpu; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lsapic)) { + acpi_os_free(buffer.pointer); + return -EINVAL; + } + + lsapic = (struct acpi_table_lsapic *)obj->buffer.pointer; + + if ((lsapic->header.type != ACPI_MADT_LSAPIC) || + (!lsapic->flags.enabled)) { + acpi_os_free(buffer.pointer); + return -EINVAL; + } + + physid = ((lsapic->id <<8) | (lsapic->eid)); + + acpi_os_free(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + if(cpu >= NR_CPUS) + return -EINVAL; + + acpi_map_cpu2node(handle, cpu, physid); + + cpu_set(cpu, cpu_present_map); + ia64_cpu_to_sapicid[cpu] = physid; + ia64_acpiid_to_sapicid[lsapic->acpi_id] = ia64_cpu_to_sapicid[cpu]; + + *pcpu = cpu; + return(0); +} +EXPORT_SYMBOL(acpi_map_lsapic); + + +int +acpi_unmap_lsapic(int cpu) +{ + int i; + + for (i=0; i<MAX_SAPICS; i++) { + if (ia64_acpiid_to_sapicid[i] == ia64_cpu_to_sapicid[cpu]) { + ia64_acpiid_to_sapicid[i] = -1; + break; + } + } + ia64_cpu_to_sapicid[cpu] = -1; + cpu_clear(cpu,cpu_present_map); + +#ifdef CONFIG_ACPI_NUMA + /* NUMA specific cleanup's */ +#endif + + return(0); +} +EXPORT_SYMBOL(acpi_unmap_lsapic); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + + +#ifdef CONFIG_ACPI_NUMA +acpi_status __init +acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *obj; + struct acpi_table_iosapic *iosapic; + unsigned int gsi_base; + int node; + + /* Only care about objects w/ a method that returns the MADT */ + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return AE_OK; + + if (!buffer.length || !buffer.pointer) + return AE_OK; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < 
sizeof(*iosapic)) { + acpi_os_free(buffer.pointer); + return AE_OK; + } + + iosapic = (struct acpi_table_iosapic *)obj->buffer.pointer; + + if (iosapic->header.type != ACPI_MADT_IOSAPIC) { + acpi_os_free(buffer.pointer); + return AE_OK; + } + + gsi_base = iosapic->global_irq_base; + + acpi_os_free(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + /* + * OK, it's an IOSAPIC MADT entry, look for a _PXM method to tell + * us which node to associate this with. + */ + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PXM", NULL, &buffer))) + return AE_OK; + + if (!buffer.length || !buffer.pointer) + return AE_OK; + + obj = buffer.pointer; + + if (obj->type != ACPI_TYPE_INTEGER || + obj->integer.value >= MAX_PXM_DOMAINS) { + acpi_os_free(buffer.pointer); + return AE_OK; + } + + node = pxm_to_nid_map[obj->integer.value]; + acpi_os_free(buffer.pointer); + + if (node >= MAX_NUMNODES || !node_online(node) || + cpus_empty(node_to_cpumask(node))) + return AE_OK; + + /* We know a gsi to node mapping! */ + map_iosapic_to_node(gsi_base, node); + return AE_OK; +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_ACPI_BOOT */ diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c new file mode 100644 index 000000000000..7d1ae2982c53 --- /dev/null +++ b/arch/ia64/kernel/asm-offsets.c @@ -0,0 +1,239 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#include <linux/config.h> + +#include <linux/sched.h> + +#include <asm-ia64/processor.h> +#include <asm-ia64/ptrace.h> +#include <asm-ia64/siginfo.h> +#include <asm-ia64/sigcontext.h> +#include <asm-ia64/mca.h> + +#include "../kernel/sigframe.h" + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +void foo(void) +{ + DEFINE(IA64_TASK_SIZE, sizeof (struct task_struct)); + DEFINE(IA64_THREAD_INFO_SIZE, sizeof (struct thread_info)); + DEFINE(IA64_PT_REGS_SIZE, sizeof (struct pt_regs)); + DEFINE(IA64_SWITCH_STACK_SIZE, sizeof (struct switch_stack)); + DEFINE(IA64_SIGINFO_SIZE, sizeof (struct siginfo)); + DEFINE(IA64_CPU_SIZE, sizeof (struct cpuinfo_ia64)); + DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); + DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); + + BLANK(); + + DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); + + BLANK(); + + DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); + DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); + DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); + DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); + DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); + DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); + DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); + DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); + DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); + DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); + + BLANK(); + + DEFINE(IA64_SIGHAND_SIGLOCK_OFFSET,offsetof (struct sighand_struct, siglock)); + + BLANK(); + + 
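+	/*
+	 * How these land in assembly (a sketch of the kbuild flow): each
+	 * DEFINE() makes the compiler emit a marker line such as
+	 *	->IA64_TASK_PID_OFFSET <value> offsetof (struct task_struct, pid)
+	 * into its .s output, and the build post-processes those "->"
+	 * markers into #define lines in the generated asm-offsets.h that
+	 * assembly sources include in place of the C structure headers.
+	 */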
DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, + group_stop_count)); + DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); + + BLANK(); + + DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6)); + DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7)); + DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd)); + DEFINE(IA64_PT_REGS_AR_SSD_OFFSET, offsetof (struct pt_regs, ar_ssd)); + DEFINE(IA64_PT_REGS_R8_OFFSET, offsetof (struct pt_regs, r8)); + DEFINE(IA64_PT_REGS_R9_OFFSET, offsetof (struct pt_regs, r9)); + DEFINE(IA64_PT_REGS_R10_OFFSET, offsetof (struct pt_regs, r10)); + DEFINE(IA64_PT_REGS_R11_OFFSET, offsetof (struct pt_regs, r11)); + DEFINE(IA64_PT_REGS_CR_IPSR_OFFSET, offsetof (struct pt_regs, cr_ipsr)); + DEFINE(IA64_PT_REGS_CR_IIP_OFFSET, offsetof (struct pt_regs, cr_iip)); + DEFINE(IA64_PT_REGS_CR_IFS_OFFSET, offsetof (struct pt_regs, cr_ifs)); + DEFINE(IA64_PT_REGS_AR_UNAT_OFFSET, offsetof (struct pt_regs, ar_unat)); + DEFINE(IA64_PT_REGS_AR_PFS_OFFSET, offsetof (struct pt_regs, ar_pfs)); + DEFINE(IA64_PT_REGS_AR_RSC_OFFSET, offsetof (struct pt_regs, ar_rsc)); + DEFINE(IA64_PT_REGS_AR_RNAT_OFFSET, offsetof (struct pt_regs, ar_rnat)); + + DEFINE(IA64_PT_REGS_AR_BSPSTORE_OFFSET, offsetof (struct pt_regs, ar_bspstore)); + DEFINE(IA64_PT_REGS_PR_OFFSET, offsetof (struct pt_regs, pr)); + DEFINE(IA64_PT_REGS_B0_OFFSET, offsetof (struct pt_regs, b0)); + DEFINE(IA64_PT_REGS_LOADRS_OFFSET, offsetof (struct pt_regs, loadrs)); + DEFINE(IA64_PT_REGS_R1_OFFSET, offsetof (struct pt_regs, r1)); + DEFINE(IA64_PT_REGS_R12_OFFSET, offsetof (struct pt_regs, r12)); + DEFINE(IA64_PT_REGS_R13_OFFSET, offsetof (struct pt_regs, r13)); + DEFINE(IA64_PT_REGS_AR_FPSR_OFFSET, offsetof (struct pt_regs, ar_fpsr)); + DEFINE(IA64_PT_REGS_R15_OFFSET, offsetof (struct pt_regs, r15)); + DEFINE(IA64_PT_REGS_R14_OFFSET, offsetof (struct pt_regs, r14)); + DEFINE(IA64_PT_REGS_R2_OFFSET, offsetof (struct pt_regs, r2)); + DEFINE(IA64_PT_REGS_R3_OFFSET, offsetof (struct pt_regs, r3)); + DEFINE(IA64_PT_REGS_R16_OFFSET, offsetof (struct pt_regs, r16)); + DEFINE(IA64_PT_REGS_R17_OFFSET, offsetof (struct pt_regs, r17)); + DEFINE(IA64_PT_REGS_R18_OFFSET, offsetof (struct pt_regs, r18)); + DEFINE(IA64_PT_REGS_R19_OFFSET, offsetof (struct pt_regs, r19)); + DEFINE(IA64_PT_REGS_R20_OFFSET, offsetof (struct pt_regs, r20)); + DEFINE(IA64_PT_REGS_R21_OFFSET, offsetof (struct pt_regs, r21)); + DEFINE(IA64_PT_REGS_R22_OFFSET, offsetof (struct pt_regs, r22)); + DEFINE(IA64_PT_REGS_R23_OFFSET, offsetof (struct pt_regs, r23)); + DEFINE(IA64_PT_REGS_R24_OFFSET, offsetof (struct pt_regs, r24)); + DEFINE(IA64_PT_REGS_R25_OFFSET, offsetof (struct pt_regs, r25)); + DEFINE(IA64_PT_REGS_R26_OFFSET, offsetof (struct pt_regs, r26)); + DEFINE(IA64_PT_REGS_R27_OFFSET, offsetof (struct pt_regs, r27)); + DEFINE(IA64_PT_REGS_R28_OFFSET, offsetof (struct pt_regs, r28)); + DEFINE(IA64_PT_REGS_R29_OFFSET, offsetof (struct pt_regs, r29)); + DEFINE(IA64_PT_REGS_R30_OFFSET, offsetof (struct pt_regs, r30)); + DEFINE(IA64_PT_REGS_R31_OFFSET, offsetof (struct pt_regs, r31)); + DEFINE(IA64_PT_REGS_AR_CCV_OFFSET, offsetof (struct pt_regs, ar_ccv)); + DEFINE(IA64_PT_REGS_F6_OFFSET, offsetof (struct pt_regs, f6)); + DEFINE(IA64_PT_REGS_F7_OFFSET, offsetof (struct pt_regs, f7)); + DEFINE(IA64_PT_REGS_F8_OFFSET, offsetof (struct pt_regs, f8)); + DEFINE(IA64_PT_REGS_F9_OFFSET, offsetof (struct pt_regs, f9)); + DEFINE(IA64_PT_REGS_F10_OFFSET, offsetof (struct pt_regs, 
f10)); + DEFINE(IA64_PT_REGS_F11_OFFSET, offsetof (struct pt_regs, f11)); + + BLANK(); + + DEFINE(IA64_SWITCH_STACK_CALLER_UNAT_OFFSET, offsetof (struct switch_stack, caller_unat)); + DEFINE(IA64_SWITCH_STACK_AR_FPSR_OFFSET, offsetof (struct switch_stack, ar_fpsr)); + DEFINE(IA64_SWITCH_STACK_F2_OFFSET, offsetof (struct switch_stack, f2)); + DEFINE(IA64_SWITCH_STACK_F3_OFFSET, offsetof (struct switch_stack, f3)); + DEFINE(IA64_SWITCH_STACK_F4_OFFSET, offsetof (struct switch_stack, f4)); + DEFINE(IA64_SWITCH_STACK_F5_OFFSET, offsetof (struct switch_stack, f5)); + DEFINE(IA64_SWITCH_STACK_F12_OFFSET, offsetof (struct switch_stack, f12)); + DEFINE(IA64_SWITCH_STACK_F13_OFFSET, offsetof (struct switch_stack, f13)); + DEFINE(IA64_SWITCH_STACK_F14_OFFSET, offsetof (struct switch_stack, f14)); + DEFINE(IA64_SWITCH_STACK_F15_OFFSET, offsetof (struct switch_stack, f15)); + DEFINE(IA64_SWITCH_STACK_F16_OFFSET, offsetof (struct switch_stack, f16)); + DEFINE(IA64_SWITCH_STACK_F17_OFFSET, offsetof (struct switch_stack, f17)); + DEFINE(IA64_SWITCH_STACK_F18_OFFSET, offsetof (struct switch_stack, f18)); + DEFINE(IA64_SWITCH_STACK_F19_OFFSET, offsetof (struct switch_stack, f19)); + DEFINE(IA64_SWITCH_STACK_F20_OFFSET, offsetof (struct switch_stack, f20)); + DEFINE(IA64_SWITCH_STACK_F21_OFFSET, offsetof (struct switch_stack, f21)); + DEFINE(IA64_SWITCH_STACK_F22_OFFSET, offsetof (struct switch_stack, f22)); + DEFINE(IA64_SWITCH_STACK_F23_OFFSET, offsetof (struct switch_stack, f23)); + DEFINE(IA64_SWITCH_STACK_F24_OFFSET, offsetof (struct switch_stack, f24)); + DEFINE(IA64_SWITCH_STACK_F25_OFFSET, offsetof (struct switch_stack, f25)); + DEFINE(IA64_SWITCH_STACK_F26_OFFSET, offsetof (struct switch_stack, f26)); + DEFINE(IA64_SWITCH_STACK_F27_OFFSET, offsetof (struct switch_stack, f27)); + DEFINE(IA64_SWITCH_STACK_F28_OFFSET, offsetof (struct switch_stack, f28)); + DEFINE(IA64_SWITCH_STACK_F29_OFFSET, offsetof (struct switch_stack, f29)); + DEFINE(IA64_SWITCH_STACK_F30_OFFSET, offsetof (struct switch_stack, f30)); + DEFINE(IA64_SWITCH_STACK_F31_OFFSET, offsetof (struct switch_stack, f31)); + DEFINE(IA64_SWITCH_STACK_R4_OFFSET, offsetof (struct switch_stack, r4)); + DEFINE(IA64_SWITCH_STACK_R5_OFFSET, offsetof (struct switch_stack, r5)); + DEFINE(IA64_SWITCH_STACK_R6_OFFSET, offsetof (struct switch_stack, r6)); + DEFINE(IA64_SWITCH_STACK_R7_OFFSET, offsetof (struct switch_stack, r7)); + DEFINE(IA64_SWITCH_STACK_B0_OFFSET, offsetof (struct switch_stack, b0)); + DEFINE(IA64_SWITCH_STACK_B1_OFFSET, offsetof (struct switch_stack, b1)); + DEFINE(IA64_SWITCH_STACK_B2_OFFSET, offsetof (struct switch_stack, b2)); + DEFINE(IA64_SWITCH_STACK_B3_OFFSET, offsetof (struct switch_stack, b3)); + DEFINE(IA64_SWITCH_STACK_B4_OFFSET, offsetof (struct switch_stack, b4)); + DEFINE(IA64_SWITCH_STACK_B5_OFFSET, offsetof (struct switch_stack, b5)); + DEFINE(IA64_SWITCH_STACK_AR_PFS_OFFSET, offsetof (struct switch_stack, ar_pfs)); + DEFINE(IA64_SWITCH_STACK_AR_LC_OFFSET, offsetof (struct switch_stack, ar_lc)); + DEFINE(IA64_SWITCH_STACK_AR_UNAT_OFFSET, offsetof (struct switch_stack, ar_unat)); + DEFINE(IA64_SWITCH_STACK_AR_RNAT_OFFSET, offsetof (struct switch_stack, ar_rnat)); + DEFINE(IA64_SWITCH_STACK_AR_BSPSTORE_OFFSET, offsetof (struct switch_stack, ar_bspstore)); + DEFINE(IA64_SWITCH_STACK_PR_OFFSET, offsetof (struct switch_stack, pr)); + + BLANK(); + + DEFINE(IA64_SIGCONTEXT_IP_OFFSET, offsetof (struct sigcontext, sc_ip)); + DEFINE(IA64_SIGCONTEXT_AR_BSP_OFFSET, offsetof (struct sigcontext, sc_ar_bsp)); + 
DEFINE(IA64_SIGCONTEXT_AR_FPSR_OFFSET, offsetof (struct sigcontext, sc_ar_fpsr)); + DEFINE(IA64_SIGCONTEXT_AR_RNAT_OFFSET, offsetof (struct sigcontext, sc_ar_rnat)); + DEFINE(IA64_SIGCONTEXT_AR_UNAT_OFFSET, offsetof (struct sigcontext, sc_ar_unat)); + DEFINE(IA64_SIGCONTEXT_B0_OFFSET, offsetof (struct sigcontext, sc_br[0])); + DEFINE(IA64_SIGCONTEXT_CFM_OFFSET, offsetof (struct sigcontext, sc_cfm)); + DEFINE(IA64_SIGCONTEXT_FLAGS_OFFSET, offsetof (struct sigcontext, sc_flags)); + DEFINE(IA64_SIGCONTEXT_FR6_OFFSET, offsetof (struct sigcontext, sc_fr[6])); + DEFINE(IA64_SIGCONTEXT_PR_OFFSET, offsetof (struct sigcontext, sc_pr)); + DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12])); + DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base)); + DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs)); + + BLANK(); + + DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal)); + + BLANK(); + + DEFINE(IA64_SIGFRAME_ARG0_OFFSET, offsetof (struct sigframe, arg0)); + DEFINE(IA64_SIGFRAME_ARG1_OFFSET, offsetof (struct sigframe, arg1)); + DEFINE(IA64_SIGFRAME_ARG2_OFFSET, offsetof (struct sigframe, arg2)); + DEFINE(IA64_SIGFRAME_HANDLER_OFFSET, offsetof (struct sigframe, handler)); + DEFINE(IA64_SIGFRAME_SIGCONTEXT_OFFSET, offsetof (struct sigframe, sc)); + BLANK(); + /* for assembly files which can't include sched.h: */ + DEFINE(IA64_CLONE_VFORK, CLONE_VFORK); + DEFINE(IA64_CLONE_VM, CLONE_VM); + + BLANK(); + DEFINE(IA64_CPUINFO_NSEC_PER_CYC_OFFSET, + offsetof (struct cpuinfo_ia64, nsec_per_cyc)); + DEFINE(IA64_CPUINFO_PTCE_BASE_OFFSET, + offsetof (struct cpuinfo_ia64, ptce_base)); + DEFINE(IA64_CPUINFO_PTCE_COUNT_OFFSET, + offsetof (struct cpuinfo_ia64, ptce_count)); + DEFINE(IA64_CPUINFO_PTCE_STRIDE_OFFSET, + offsetof (struct cpuinfo_ia64, ptce_stride)); + BLANK(); + DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, + offsetof (struct timespec, tv_nsec)); + + DEFINE(CLONE_SETTLS_BIT, 19); +#if CLONE_SETTLS != (1<<19) +# error "CLONE_SETTLS_BIT incorrect, please fix" +#endif + + BLANK(); + DEFINE(IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, + offsetof (struct ia64_mca_cpu, proc_state_dump)); + DEFINE(IA64_MCA_CPU_STACK_OFFSET, + offsetof (struct ia64_mca_cpu, stack)); + DEFINE(IA64_MCA_CPU_STACKFRAME_OFFSET, + offsetof (struct ia64_mca_cpu, stackframe)); + DEFINE(IA64_MCA_CPU_RBSTORE_OFFSET, + offsetof (struct ia64_mca_cpu, rbstore)); + DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET, + offsetof (struct ia64_mca_cpu, init_stack)); + BLANK(); + /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ + DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); + DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); + DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift)); + DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc)); + DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset)); + DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle)); + DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter)); + DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter)); + DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask)); + DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU); + DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); 
+ DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); + DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); +} diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c new file mode 100644 index 000000000000..0b286ca164f9 --- /dev/null +++ b/arch/ia64/kernel/brl_emu.c @@ -0,0 +1,234 @@ +/* + * Emulation of the "brl" instruction for IA64 processors that + * don't support it in hardware. + * Author: Stephan Zeisset, Intel Corp. <Stephan.Zeisset@intel.com> + * + * 02/22/02 D. Mosberger Clear si_flgs, si_isr, and si_imm to avoid + * leaking kernel bits. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <asm/uaccess.h> +#include <asm/processor.h> + +extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5; + +struct illegal_op_return { + unsigned long fkt, arg1, arg2, arg3; +}; + +/* + * The unimplemented bits of a virtual address must be set + * to the value of the most significant implemented bit. + * unimpl_va_mask includes all unimplemented bits and + * the most significant implemented bit, so the result + * of an and operation with the mask must be all 0's + * or all 1's for the address to be valid. + */ +#define unimplemented_virtual_address(va) ( \ + ((va) & local_cpu_data->unimpl_va_mask) != 0 && \ + ((va) & local_cpu_data->unimpl_va_mask) != local_cpu_data->unimpl_va_mask \ +) + +/* + * The unimplemented bits of a physical address must be 0. + * unimpl_pa_mask includes all unimplemented bits, so the result + * of an and operation with the mask must be all 0's for the + * address to be valid. + */ +#define unimplemented_physical_address(pa) ( \ + ((pa) & local_cpu_data->unimpl_pa_mask) != 0 \ +) + +/* + * Handle an illegal operation fault that was caused by an + * unimplemented "brl" instruction. + * If we are not successful (e.g because the illegal operation + * wasn't caused by a "brl" after all), we return -1. + * If we are successful, we return either 0 or the address + * of a "fixup" function for manipulating preserved register + * state. + */ + +struct illegal_op_return +ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec) +{ + unsigned long bundle[2]; + unsigned long opcode, btype, qp, offset, cpl; + unsigned long next_ip; + struct siginfo siginfo; + struct illegal_op_return rv; + long tmp_taken, unimplemented_address; + + rv.fkt = (unsigned long) -1; + + /* + * Decode the instruction bundle. + */ + + if (copy_from_user(bundle, (void *) (regs->cr_iip), sizeof(bundle))) + return rv; + + next_ip = (unsigned long) regs->cr_iip + 16; + + /* "brl" must be in slot 2. */ + if (ia64_psr(regs)->ri != 1) return rv; + + /* Must be "mlx" template */ + if ((bundle[0] & 0x1e) != 0x4) return rv; + + opcode = (bundle[1] >> 60); + btype = ((bundle[1] >> 29) & 0x7); + qp = ((bundle[1] >> 23) & 0x3f); + offset = ((bundle[1] & 0x0800000000000000L) << 4) + | ((bundle[1] & 0x00fffff000000000L) >> 32) + | ((bundle[1] & 0x00000000007fffffL) << 40) + | ((bundle[0] & 0xffff000000000000L) >> 24); + + tmp_taken = regs->pr & (1L << qp); + + switch(opcode) { + + case 0xC: + /* + * Long Branch. + */ + if (btype != 0) return rv; + rv.fkt = 0; + if (!(tmp_taken)) { + /* + * Qualifying predicate is 0. + * Skip instruction. + */ + regs->cr_iip = next_ip; + ia64_psr(regs)->ri = 0; + return rv; + } + break; + + case 0xD: + /* + * Long Call. + */ + rv.fkt = 0; + if (!(tmp_taken)) { + /* + * Qualifying predicate is 0. + * Skip instruction. 
+ */ + regs->cr_iip = next_ip; + ia64_psr(regs)->ri = 0; + return rv; + } + + /* + * BR[btype] = IP+16 + */ + switch(btype) { + case 0: + regs->b0 = next_ip; + break; + case 1: + rv.fkt = (unsigned long) &ia64_set_b1; + break; + case 2: + rv.fkt = (unsigned long) &ia64_set_b2; + break; + case 3: + rv.fkt = (unsigned long) &ia64_set_b3; + break; + case 4: + rv.fkt = (unsigned long) &ia64_set_b4; + break; + case 5: + rv.fkt = (unsigned long) &ia64_set_b5; + break; + case 6: + regs->b6 = next_ip; + break; + case 7: + regs->b7 = next_ip; + break; + } + rv.arg1 = next_ip; + + /* + * AR[PFS].pfm = CFM + * AR[PFS].pec = AR[EC] + * AR[PFS].ppl = PSR.cpl + */ + cpl = ia64_psr(regs)->cpl; + regs->ar_pfs = ((regs->cr_ifs & 0x3fffffffff) + | (ar_ec << 52) | (cpl << 62)); + + /* + * CFM.sof -= CFM.sol + * CFM.sol = 0 + * CFM.sor = 0 + * CFM.rrb.gr = 0 + * CFM.rrb.fr = 0 + * CFM.rrb.pr = 0 + */ + regs->cr_ifs = ((regs->cr_ifs & 0xffffffc00000007f) + - ((regs->cr_ifs >> 7) & 0x7f)); + + break; + + default: + /* + * Unknown opcode. + */ + return rv; + + } + + regs->cr_iip += offset; + ia64_psr(regs)->ri = 0; + + if (ia64_psr(regs)->it == 0) + unimplemented_address = unimplemented_physical_address(regs->cr_iip); + else + unimplemented_address = unimplemented_virtual_address(regs->cr_iip); + + if (unimplemented_address) { + /* + * The target address contains unimplemented bits. + */ + printk(KERN_DEBUG "Woah! Unimplemented Instruction Address Trap!\n"); + siginfo.si_signo = SIGILL; + siginfo.si_errno = 0; + siginfo.si_flags = 0; + siginfo.si_isr = 0; + siginfo.si_imm = 0; + siginfo.si_code = ILL_BADIADDR; + force_sig_info(SIGILL, &siginfo, current); + } else if (ia64_psr(regs)->tb) { + /* + * Branch Tracing is enabled. + * Force a taken branch signal. + */ + siginfo.si_signo = SIGTRAP; + siginfo.si_errno = 0; + siginfo.si_code = TRAP_BRANCH; + siginfo.si_flags = 0; + siginfo.si_isr = 0; + siginfo.si_addr = 0; + siginfo.si_imm = 0; + force_sig_info(SIGTRAP, &siginfo, current); + } else if (ia64_psr(regs)->ss) { + /* + * Single Step is enabled. + * Force a trace signal. 
+ */ + siginfo.si_signo = SIGTRAP; + siginfo.si_errno = 0; + siginfo.si_code = TRAP_TRACE; + siginfo.si_flags = 0; + siginfo.si_isr = 0; + siginfo.si_addr = 0; + siginfo.si_imm = 0; + force_sig_info(SIGTRAP, &siginfo, current); + } + return rv; +} diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c new file mode 100644 index 000000000000..768c7e46957c --- /dev/null +++ b/arch/ia64/kernel/cyclone.c @@ -0,0 +1,109 @@ +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/time.h> +#include <linux/errno.h> +#include <asm/io.h> + +/* IBM Summit (EXA) Cyclone counter code*/ +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 +#define CYCLONE_PMCC_OFFSET 0x51A0 +#define CYCLONE_MPMC_OFFSET 0x51D0 +#define CYCLONE_MPCS_OFFSET 0x51A8 +#define CYCLONE_TIMER_FREQ 100000000 + +int use_cyclone; +void __init cyclone_setup(void) +{ + use_cyclone = 1; +} + + +struct time_interpolator cyclone_interpolator = { + .source = TIME_SOURCE_MMIO64, + .shift = 16, + .frequency = CYCLONE_TIMER_FREQ, + .drift = -100, + .mask = (1LL << 40) - 1 +}; + +int __init init_cyclone_clock(void) +{ + u64* reg; + u64 base; /* saved cyclone base address */ + u64 offset; /* offset from pageaddr to cyclone_timer register */ + int i; + u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ + + if (!use_cyclone) + return -ENODEV; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address */ + offset = (CYCLONE_CBAR_ADDR); + reg = (u64*)ioremap_nocache(offset, sizeof(u64)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); + use_cyclone = 0; + return -ENODEV; + } + base = readq(reg); + if(!base){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); + use_cyclone = 0; + return -ENODEV; + } + iounmap(reg); + + /* setup PMCC */ + offset = (base + CYCLONE_PMCC_OFFSET); + reg = (u64*)ioremap_nocache(offset, sizeof(u64)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); + use_cyclone = 0; + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* setup MPCS */ + offset = (base + CYCLONE_MPCS_OFFSET); + reg = (u64*)ioremap_nocache(offset, sizeof(u64)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); + use_cyclone = 0; + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* map in cyclone_timer */ + offset = (base + CYCLONE_MPMC_OFFSET); + cyclone_timer = (u32*)ioremap_nocache(offset, sizeof(u32)); + if(!cyclone_timer){ + printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); + use_cyclone = 0; + return -ENODEV; + } + + /*quick test to make sure its ticking*/ + for(i=0; i<3; i++){ + u32 old = readl(cyclone_timer); + int stall = 100; + while(stall--) barrier(); + if(readl(cyclone_timer) == old){ + printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = 0; + use_cyclone = 0; + return -ENODEV; + } + } + /* initialize last tick */ + cyclone_interpolator.addr = cyclone_timer; + register_time_interpolator(&cyclone_interpolator); + + return 0; +} + +__initcall(init_cyclone_clock); diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c new file mode 100644 index 000000000000..fe532c970438 --- /dev/null +++ b/arch/ia64/kernel/domain.c @@ -0,0 +1,382 @@ +/* + * arch/ia64/kernel/domain.c + * Architecture specific sched-domains builder. + * + * Copyright (C) 2004 Jesse Barnes + * Copyright (C) 2004 Silicon Graphics, Inc. 
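+ *
+ * (Summary of what follows: a hierarchy of sched-domains is built from
+ * optional SMT sibling domains, per-node physical domains, NUMA node
+ * domains spanning up to SD_NODES_PER_DOMAIN nearby nodes, and, on very
+ * large systems, an all-nodes domain on top.)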
+ */ + +#include <linux/sched.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/topology.h> +#include <linux/nodemask.h> + +#define SD_NODES_PER_DOMAIN 6 + +#ifdef CONFIG_NUMA +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int __devinit find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t __devinit sched_domain_node_span(int node) +{ + int i; + cpumask_t span, nodemask; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + nodemask = node_to_cpumask(node); + cpus_or(span, span, nodemask); + set_bit(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#endif + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. + */ +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA +/* + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. + */ +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group *sched_group_nodes[MAX_NUMNODES]; + +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static struct sched_group sched_group_allnodes[MAX_NUMNODES]; + +static int __devinit cpu_to_allnodes_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +void __devinit arch_init_sched_domains(void) +{ + int i; + cpumask_t cpu_default_map; + + /* + * Setup mask for cpus without special case scheduling requirements. 
+ * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); + + /* + * Set up domains. Isolated domains just stay on the dummy domain. + */ + for_each_cpu_mask(i, cpu_default_map) { + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + + cpus_and(nodemask, nodemask, cpu_default_map); + +#ifdef CONFIG_NUMA + if (num_online_cpus() + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + sd = &per_cpu(allnodes_domains, i); + *sd = SD_ALLNODES_INIT; + sd->span = cpu_default_map; + group = cpu_to_allnodes_group(i); + sd->groups = &sched_group_allnodes[group]; + p = sd; + } else + p = NULL; + + sd = &per_cpu(node_domains, i); + *sd = SD_NODE_INIT; + sd->span = sched_domain_node_span(cpu_to_node(i)); + sd->parent = p; + cpus_and(sd->span, sd->span, cpu_default_map); +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + cpus_and(sd->span, sd->span, cpu_default_map); + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu_mask(i, cpu_default_map) { + cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + if (i != first_cpu(this_sibling_map)) + continue; + + init_sched_build_groups(sched_group_cpus, this_sibling_map, + &cpu_to_cpu_group); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); + } + +#ifdef CONFIG_NUMA + init_sched_build_groups(sched_group_allnodes, cpu_default_map, + &cpu_to_allnodes_group); + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, cpu_default_map); + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + /* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, cpu_default_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct sched_group), 
GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu_mask(i, cpu_default_map) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + +#ifdef CONFIG_NUMA + sd = &per_cpu(allnodes_domains, i); + if (sd->groups) { + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + } +#endif + } + +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + + /* Attach the domains */ + for_each_online_cpu(i) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); + } +} + +void __devinit arch_destroy_sched_domains(void) +{ +#ifdef CONFIG_NUMA + int i; + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + sched_group_nodes[i] = NULL; + } +#endif +} + diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c new file mode 100644 index 000000000000..4a3b1aac43e7 --- /dev/null +++ b/arch/ia64/kernel/efi.c @@ -0,0 +1,832 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 0.9 April 30, 1999 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2003 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: <goutham.rao@intel.com> + * Skip non-WB memory and ignore empty memory ranges. 
+ */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/time.h> +#include <linux/efi.h> + +#include <asm/io.h> +#include <asm/kregs.h> +#include <asm/meminit.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/mca.h> + +#define EFI_DEBUG 0 + +extern efi_status_t efi_call_phys (void *, ...); + +struct efi efi; +EXPORT_SYMBOL(efi); +static efi_runtime_services_t *runtime; +static unsigned long mem_limit = ~0UL, max_addr = ~0UL; + +#define efi_call_virt(f, args...) (*(f))(args) + +#define STUB_GET_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_time_cap_t *atc = NULL; \ + efi_status_t ret; \ + \ + if (tc) \ + atc = adjust_arg(tc); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), atc); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_time (efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), adjust_arg(tm)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time), \ + adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_time_t *atm = NULL; \ + efi_status_t ret; \ + \ + if (tm) \ + atm = adjust_arg(tm); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time), \ + enabled, atm); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr, \ + unsigned long *data_size, void *data) \ +{ \ + struct ia64_fpreg fr[6]; \ + u32 *aattr = NULL; \ + efi_status_t ret; \ + \ + if (attr) \ + aattr = adjust_arg(attr); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_variable_t *) __va(runtime->get_variable), \ + adjust_arg(name), adjust_arg(vendor), aattr, \ + adjust_arg(data_size), adjust_arg(data)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_next_variable_t *) __va(runtime->get_next_variable), \ + adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, unsigned long attr, \ + unsigned long data_size, void 
*data) \
+{ \
+ struct ia64_fpreg fr[6]; \
+ efi_status_t ret; \
+ \
+ ia64_save_scratch_fpregs(fr); \
+ ret = efi_call_##prefix((efi_set_variable_t *) __va(runtime->set_variable), \
+ adjust_arg(name), adjust_arg(vendor), attr, data_size, \
+ adjust_arg(data)); \
+ ia64_load_scratch_fpregs(fr); \
+ return ret; \
+}
+
+#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg) \
+static efi_status_t \
+prefix##_get_next_high_mono_count (u32 *count) \
+{ \
+ struct ia64_fpreg fr[6]; \
+ efi_status_t ret; \
+ \
+ ia64_save_scratch_fpregs(fr); \
+ ret = efi_call_##prefix((efi_get_next_high_mono_count_t *) \
+ __va(runtime->get_next_high_mono_count), adjust_arg(count)); \
+ ia64_load_scratch_fpregs(fr); \
+ return ret; \
+}
+
+#define STUB_RESET_SYSTEM(prefix, adjust_arg) \
+static void \
+prefix##_reset_system (int reset_type, efi_status_t status, \
+ unsigned long data_size, efi_char16_t *data) \
+{ \
+ struct ia64_fpreg fr[6]; \
+ efi_char16_t *adata = NULL; \
+ \
+ if (data) \
+ adata = adjust_arg(data); \
+ \
+ ia64_save_scratch_fpregs(fr); \
+ efi_call_##prefix((efi_reset_system_t *) __va(runtime->reset_system), \
+ reset_type, status, data_size, adata); \
+ /* should not return, but just in case... */ \
+ ia64_load_scratch_fpregs(fr); \
+}
+
+#define phys_ptr(arg) ((__typeof__(arg)) ia64_tpa(arg))
+
+STUB_GET_TIME(phys, phys_ptr)
+STUB_SET_TIME(phys, phys_ptr)
+STUB_GET_WAKEUP_TIME(phys, phys_ptr)
+STUB_SET_WAKEUP_TIME(phys, phys_ptr)
+STUB_GET_VARIABLE(phys, phys_ptr)
+STUB_GET_NEXT_VARIABLE(phys, phys_ptr)
+STUB_SET_VARIABLE(phys, phys_ptr)
+STUB_GET_NEXT_HIGH_MONO_COUNT(phys, phys_ptr)
+STUB_RESET_SYSTEM(phys, phys_ptr)
+
+#define id(arg) arg
+
+STUB_GET_TIME(virt, id)
+STUB_SET_TIME(virt, id)
+STUB_GET_WAKEUP_TIME(virt, id)
+STUB_SET_WAKEUP_TIME(virt, id)
+STUB_GET_VARIABLE(virt, id)
+STUB_GET_NEXT_VARIABLE(virt, id)
+STUB_SET_VARIABLE(virt, id)
+STUB_GET_NEXT_HIGH_MONO_COUNT(virt, id)
+STUB_RESET_SYSTEM(virt, id)
+
+void
+efi_gettimeofday (struct timespec *ts)
+{
+ efi_time_t tm;
+
+ memset(ts, 0, sizeof(*ts));
+ if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS)
+ return;
+
+ ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second);
+ ts->tv_nsec = tm.nanosecond;
+}
+
+static int
+is_available_memory (efi_memory_desc_t *md)
+{
+ if (!(md->attribute & EFI_MEMORY_WB))
+ return 0;
+
+ switch (md->type) {
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_CONVENTIONAL_MEMORY:
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Trim descriptor MD so it starts at address START_ADDR. If the descriptor covers
+ * memory that is normally available to the kernel, issue a warning that some memory
+ * is being ignored.
+ */
+static void
+trim_bottom (efi_memory_desc_t *md, u64 start_addr)
+{
+ u64 num_skipped_pages;
+
+ if (md->phys_addr >= start_addr || !md->num_pages)
+ return;
+
+ num_skipped_pages = (start_addr - md->phys_addr) >> EFI_PAGE_SHIFT;
+ if (num_skipped_pages > md->num_pages)
+ num_skipped_pages = md->num_pages;
+
+ if (is_available_memory(md))
+ printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole "
+ "at 0x%lx\n", __FUNCTION__,
+ (num_skipped_pages << EFI_PAGE_SHIFT) >> 10,
+ md->phys_addr, start_addr - IA64_GRANULE_SIZE);
+ /*
+ * NOTE: Don't set md->phys_addr to START_ADDR because that could cause the memory
+ * descriptor list to become unsorted. In such a case, md->num_pages will be
+ * zero, so the Right Thing will happen. 
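To make the trim page arithmetic concrete: with 4KB EFI pages and a descriptor that starts 24KB below the granule-aligned trim point, six pages get skipped. A stand-alone sketch (the addresses and the page shift are illustrative; the kernel takes them from the boot parameters and asm headers):

	#include <stdio.h>
	#include <stdint.h>

	#define EFI_PAGE_SHIFT 12		/* 4KB EFI pages */

	int main(void)
	{
		uint64_t phys_addr  = 0xffa000;		/* descriptor start (example) */
		uint64_t start_addr = 0x1000000;	/* granule-aligned trim target */
		uint64_t skipped = (start_addr - phys_addr) >> EFI_PAGE_SHIFT;

		/* advance the start instead of assigning start_addr outright,
		 * so the descriptor list stays sorted (see the NOTE above): */
		phys_addr += skipped << EFI_PAGE_SHIFT;
		printf("skipped %llu pages, new start 0x%llx\n",
		       (unsigned long long)skipped, (unsigned long long)phys_addr);
		return 0;
	}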
+ */ + md->phys_addr += num_skipped_pages << EFI_PAGE_SHIFT; + md->num_pages -= num_skipped_pages; +} + +static void +trim_top (efi_memory_desc_t *md, u64 end_addr) +{ + u64 num_dropped_pages, md_end_addr; + + md_end_addr = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (md_end_addr <= end_addr || !md->num_pages) + return; + + num_dropped_pages = (md_end_addr - end_addr) >> EFI_PAGE_SHIFT; + if (num_dropped_pages > md->num_pages) + num_dropped_pages = md->num_pages; + + if (is_available_memory(md)) + printk(KERN_NOTICE "efi.%s: ignoring %luKB of memory at 0x%lx due to granule hole " + "at 0x%lx\n", __FUNCTION__, + (num_dropped_pages << EFI_PAGE_SHIFT) >> 10, + md->phys_addr, end_addr); + md->num_pages -= num_dropped_pages; +} + +/* + * Walks the EFI memory map and calls CALLBACK once for each EFI memory descriptor that + * has memory that is available for OS use. + */ +void +efi_memmap_walk (efi_freemem_callback_t callback, void *arg) +{ + int prev_valid = 0; + struct range { + u64 start; + u64 end; + } prev, curr; + void *efi_map_start, *efi_map_end, *p, *q; + efi_memory_desc_t *md, *check_md; + u64 efi_desc_size, start, end, granule_addr, last_granule_addr, first_non_wb_addr = 0; + unsigned long total_mem = 0; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + /* skip over non-WB memory descriptors; that's all we're interested in... */ + if (!(md->attribute & EFI_MEMORY_WB)) + continue; + + /* + * granule_addr is the base of md's first granule. + * [granule_addr - first_non_wb_addr) is guaranteed to + * be contiguous WB memory. + */ + granule_addr = GRANULEROUNDDOWN(md->phys_addr); + first_non_wb_addr = max(first_non_wb_addr, granule_addr); + + if (first_non_wb_addr < md->phys_addr) { + trim_bottom(md, granule_addr + IA64_GRANULE_SIZE); + granule_addr = GRANULEROUNDDOWN(md->phys_addr); + first_non_wb_addr = max(first_non_wb_addr, granule_addr); + } + + for (q = p; q < efi_map_end; q += efi_desc_size) { + check_md = q; + + if ((check_md->attribute & EFI_MEMORY_WB) && + (check_md->phys_addr == first_non_wb_addr)) + first_non_wb_addr += check_md->num_pages << EFI_PAGE_SHIFT; + else + break; /* non-WB or hole */ + } + + last_granule_addr = GRANULEROUNDDOWN(first_non_wb_addr); + if (last_granule_addr < md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) + trim_top(md, last_granule_addr); + + if (is_available_memory(md)) { + if (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) >= max_addr) { + if (md->phys_addr >= max_addr) + continue; + md->num_pages = (max_addr - md->phys_addr) >> EFI_PAGE_SHIFT; + first_non_wb_addr = max_addr; + } + + if (total_mem >= mem_limit) + continue; + + if (total_mem + (md->num_pages << EFI_PAGE_SHIFT) > mem_limit) { + unsigned long limit_addr = md->phys_addr; + + limit_addr += mem_limit - total_mem; + limit_addr = GRANULEROUNDDOWN(limit_addr); + + if (md->phys_addr > limit_addr) + continue; + + md->num_pages = (limit_addr - md->phys_addr) >> + EFI_PAGE_SHIFT; + first_non_wb_addr = max_addr = md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT); + } + total_mem += (md->num_pages << EFI_PAGE_SHIFT); + + if (md->num_pages == 0) + continue; + + curr.start = PAGE_OFFSET + md->phys_addr; + curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); + + if (!prev_valid) { + prev = curr; + prev_valid = 1; + } else { + if (curr.start < prev.start) + printk(KERN_ERR 
"Oops: EFI memory table not ordered!\n"); + + if (prev.end == curr.start) { + /* merge two consecutive memory ranges */ + prev.end = curr.end; + } else { + start = PAGE_ALIGN(prev.start); + end = prev.end & PAGE_MASK; + if ((end > start) && (*callback)(start, end, arg) < 0) + return; + prev = curr; + } + } + } + } + if (prev_valid) { + start = PAGE_ALIGN(prev.start); + end = prev.end & PAGE_MASK; + if (end > start) + (*callback)(start, end, arg); + } +} + +/* + * Look for the PAL_CODE region reported by EFI and maps it using an + * ITR to enable safe PAL calls in virtual mode. See IA-64 Processor + * Abstraction Layer chapter 11 in ADAG + */ + +void * +efi_get_pal_addr (void) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + int pal_code_count = 0; + u64 vaddr, mask; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->type != EFI_PAL_CODE) + continue; + + if (++pal_code_count > 1) { + printk(KERN_ERR "Too many EFI Pal Code memory ranges, dropped @ %lx\n", + md->phys_addr); + continue; + } + /* + * The only ITLB entry in region 7 that is used is the one installed by + * __start(). That entry covers a 64MB range. + */ + mask = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1); + vaddr = PAGE_OFFSET + md->phys_addr; + + /* + * We must check that the PAL mapping won't overlap with the kernel + * mapping. + * + * PAL code is guaranteed to be aligned on a power of 2 between 4k and + * 256KB and that only one ITR is needed to map it. This implies that the + * PAL code is always aligned on its size, i.e., the closest matching page + * size supported by the TLB. Therefore PAL code is guaranteed never to + * cross a 64MB unless it is bigger than 64MB (very unlikely!). So for + * now the following test is enough to determine whether or not we need a + * dedicated ITR for the PAL code. + */ + if ((vaddr & mask) == (KERNEL_START & mask)) { + printk(KERN_INFO "%s: no need to install ITR for PAL code\n", + __FUNCTION__); + continue; + } + + if (md->num_pages << EFI_PAGE_SHIFT > IA64_GRANULE_SIZE) + panic("Woah! PAL code size bigger than a granule!"); + +#if EFI_DEBUG + mask = ~((1 << IA64_GRANULE_SHIFT) - 1); + + printk(KERN_INFO "CPU %d: mapping PAL code [0x%lx-0x%lx) into [0x%lx-0x%lx)\n", + smp_processor_id(), md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE); +#endif + return __va(md->phys_addr); + } + printk(KERN_WARNING "%s: no PAL-code memory-descriptor found", + __FUNCTION__); + return NULL; +} + +void +efi_map_pal_code (void) +{ + void *pal_vaddr = efi_get_pal_addr (); + u64 psr; + + if (!pal_vaddr) + return; + + /* + * Cannot write to CRx with PSR.ic=1 + */ + psr = ia64_clear_ic(); + ia64_itr(0x1, IA64_TR_PALCODE, GRANULEROUNDDOWN((unsigned long) pal_vaddr), + pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)), + IA64_GRANULE_SHIFT); + ia64_set_psr(psr); /* restore psr */ + ia64_srlz_i(); +} + +void __init +efi_init (void) +{ + void *efi_map_start, *efi_map_end; + efi_config_table_t *config_tables; + efi_char16_t *c16; + u64 efi_desc_size; + char *cp, *end, vendor[100] = "unknown"; + extern char saved_command_line[]; + int i; + + /* it's too early to be able to use the standard kernel command line support... 
*/
+ for (cp = saved_command_line; *cp; ) {
+ if (memcmp(cp, "mem=", 4) == 0) {
+ cp += 4;
+ mem_limit = memparse(cp, &end);
+ if (end != cp)
+ break;
+ cp = end;
+ } else if (memcmp(cp, "max_addr=", 9) == 0) {
+ cp += 9;
+ max_addr = GRANULEROUNDDOWN(memparse(cp, &end));
+ if (end != cp)
+ break;
+ cp = end;
+ } else {
+ while (*cp != ' ' && *cp)
+ ++cp;
+ while (*cp == ' ')
+ ++cp;
+ }
+ }
+ if (max_addr != ~0UL)
+ printk(KERN_INFO "Ignoring memory above %luMB\n", max_addr >> 20);
+
+ efi.systab = __va(ia64_boot_param->efi_systab);
+
+ /*
+ * Verify the EFI Table
+ */
+ if (efi.systab == NULL)
+ panic("Woah! Can't find EFI system table.\n");
+ if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+ panic("Woah! EFI system table signature incorrect\n");
+ if ((efi.systab->hdr.revision ^ EFI_SYSTEM_TABLE_REVISION) >> 16 != 0)
+ printk(KERN_WARNING "Warning: EFI system table major version mismatch: "
+ "got %d.%02d, expected %d.%02d\n",
+ efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff,
+ EFI_SYSTEM_TABLE_REVISION >> 16, EFI_SYSTEM_TABLE_REVISION & 0xffff);
+
+ config_tables = __va(efi.systab->tables);
+
+ /* Show what we know for posterity */
+ c16 = __va(efi.systab->fw_vendor);
+ if (c16) {
+ for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i)
+ vendor[i] = *c16++;
+ vendor[i] = '\0';
+ }
+
+ printk(KERN_INFO "EFI v%u.%.02u by %s:",
+ efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor);
+
+ for (i = 0; i < (int) efi.systab->nr_tables; i++) {
+ if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
+ efi.mps = __va(config_tables[i].table);
+ printk(" MPS=0x%lx", config_tables[i].table);
+ } else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
+ efi.acpi20 = __va(config_tables[i].table);
+ printk(" ACPI 2.0=0x%lx", config_tables[i].table);
+ } else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
+ efi.acpi = __va(config_tables[i].table);
+ printk(" ACPI=0x%lx", config_tables[i].table);
+ } else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
+ efi.smbios = __va(config_tables[i].table);
+ printk(" SMBIOS=0x%lx", config_tables[i].table);
+ } else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) {
+ efi.sal_systab = __va(config_tables[i].table);
+ printk(" SALsystab=0x%lx", config_tables[i].table);
+ } else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
+ efi.hcdp = __va(config_tables[i].table);
+ printk(" HCDP=0x%lx", config_tables[i].table);
+ }
+ }
+ printk("\n");
+
+ runtime = __va(efi.systab->runtime);
+ efi.get_time = phys_get_time;
+ efi.set_time = phys_set_time;
+ efi.get_wakeup_time = phys_get_wakeup_time;
+ efi.set_wakeup_time = phys_set_wakeup_time;
+ efi.get_variable = phys_get_variable;
+ efi.get_next_variable = phys_get_next_variable;
+ efi.set_variable = phys_set_variable;
+ efi.get_next_high_mono_count = phys_get_next_high_mono_count;
+ efi.reset_system = phys_reset_system;
+
+ efi_map_start = __va(ia64_boot_param->efi_memmap);
+ efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size;
+ efi_desc_size = ia64_boot_param->efi_memdesc_size;
+
+#if EFI_DEBUG
+ /* print EFI memory map: */
+ {
+ efi_memory_desc_t *md;
+ void *p;
+
+ for (i = 0, p = efi_map_start; p < efi_map_end; ++i, p += efi_desc_size) {
+ md = p;
+ printk("mem%02u: type=%u, attr=0x%lx, range=[0x%016lx-0x%016lx) (%luMB)\n",
+ i, md->type, md->attribute, md->phys_addr,
+ md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
+ md->num_pages >> (20 - EFI_PAGE_SHIFT));
+ }
+ 
} +#endif + + efi_map_pal_code(); + efi_enter_virtual_mode(); +} + +void +efi_enter_virtual_mode (void) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + efi_status_t status; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->attribute & EFI_MEMORY_RUNTIME) { + /* + * Some descriptors have multiple bits set, so the order of + * the tests is relevant. + */ + if (md->attribute & EFI_MEMORY_WB) { + md->virt_addr = (u64) __va(md->phys_addr); + } else if (md->attribute & EFI_MEMORY_UC) { + md->virt_addr = (u64) ioremap(md->phys_addr, 0); + } else if (md->attribute & EFI_MEMORY_WC) { +#if 0 + md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P + | _PAGE_D + | _PAGE_MA_WC + | _PAGE_PL_0 + | _PAGE_AR_RW)); +#else + printk(KERN_INFO "EFI_MEMORY_WC mapping\n"); + md->virt_addr = (u64) ioremap(md->phys_addr, 0); +#endif + } else if (md->attribute & EFI_MEMORY_WT) { +#if 0 + md->virt_addr = ia64_remap(md->phys_addr, (_PAGE_A | _PAGE_P + | _PAGE_D | _PAGE_MA_WT + | _PAGE_PL_0 + | _PAGE_AR_RW)); +#else + printk(KERN_INFO "EFI_MEMORY_WT mapping\n"); + md->virt_addr = (u64) ioremap(md->phys_addr, 0); +#endif + } + } + } + + status = efi_call_phys(__va(runtime->set_virtual_address_map), + ia64_boot_param->efi_memmap_size, + efi_desc_size, ia64_boot_param->efi_memdesc_version, + ia64_boot_param->efi_memmap); + if (status != EFI_SUCCESS) { + printk(KERN_WARNING "warning: unable to switch EFI into virtual mode " + "(status=%lu)\n", status); + return; + } + + /* + * Now that EFI is in virtual mode, we call the EFI functions more efficiently: + */ + efi.get_time = virt_get_time; + efi.set_time = virt_set_time; + efi.get_wakeup_time = virt_get_wakeup_time; + efi.set_wakeup_time = virt_set_wakeup_time; + efi.get_variable = virt_get_variable; + efi.get_next_variable = virt_get_next_variable; + efi.set_variable = virt_set_variable; + efi.get_next_high_mono_count = virt_get_next_high_mono_count; + efi.reset_system = virt_reset_system; +} + +/* + * Walk the EFI memory map looking for the I/O port range. There can only be one entry of + * this type, other I/O port ranges should be described via ACPI. 
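One idiom worth noting in all the map walkers in this file: the stride is the firmware-reported efi_memdesc_size, not sizeof(efi_memory_desc_t), because the firmware may lay out descriptors larger than the structure the OS knows about, so plain pointer increment would go out of step. A sketch with simplified, made-up types:

	#include <stdio.h>
	#include <stdint.h>

	struct fake_memory_desc {		/* simplified stand-in */
		uint32_t type;
		uint64_t phys_addr;
		uint64_t num_pages;
		/* firmware may append fields the OS never sees */
	};

	static void walk(void *map, uint64_t map_size, uint64_t desc_size)
	{
		char *p, *end = (char *)map + map_size;

		for (p = map; p < end; p += desc_size) {	/* stride = desc_size */
			struct fake_memory_desc *md = (struct fake_memory_desc *)p;
			printf("type=%u at 0x%llx\n", md->type,
			       (unsigned long long)md->phys_addr);
		}
	}

	int main(void)
	{
		union { unsigned char bytes[80]; uint64_t align; } map = { { 0 } };
		struct fake_memory_desc *md = (struct fake_memory_desc *)map.bytes;

		md->type = 7;  md->phys_addr = 0x100000;	/* EFI_CONVENTIONAL_MEMORY */
		md = (struct fake_memory_desc *)(map.bytes + 40);
		md->type = 11; md->phys_addr = 0x4000000;	/* EFI_MEMORY_MAPPED_IO */
		walk(map.bytes, sizeof(map.bytes), 40);		/* 40-byte firmware stride */
		return 0;
	}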
+ */ +u64 +efi_get_iobase (void) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) { + if (md->attribute & EFI_MEMORY_UC) + return md->phys_addr; + } + } + return 0; +} + +u32 +efi_mem_type (unsigned long phys_addr) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) + return md->type; + } + return 0; +} + +u64 +efi_mem_attributes (unsigned long phys_addr) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) + return md->attribute; + } + return 0; +} +EXPORT_SYMBOL(efi_mem_attributes); + +int +valid_phys_addr_range (unsigned long phys_addr, unsigned long *size) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) { + if (!(md->attribute & EFI_MEMORY_WB)) + return 0; + + if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr) + *size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr; + return 1; + } + } + return 0; +} + +int __init +efi_uart_console_only(void) +{ + efi_status_t status; + char *s, name[] = "ConOut"; + efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID; + efi_char16_t *utf16, name_utf16[32]; + unsigned char data[1024]; + unsigned long size = sizeof(data); + struct efi_generic_dev_path *hdr, *end_addr; + int uart = 0; + + /* Convert to UTF-16 */ + utf16 = name_utf16; + s = name; + while (*s) + *utf16++ = *s++ & 0x7f; + *utf16 = 0; + + status = efi.get_variable(name_utf16, &guid, NULL, &size, data); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "No EFI %s variable?\n", name); + return 0; + } + + hdr = (struct efi_generic_dev_path *) data; + end_addr = (struct efi_generic_dev_path *) ((u8 *) data + size); + while (hdr < end_addr) { + if (hdr->type == EFI_DEV_MSG && + hdr->sub_type == EFI_DEV_MSG_UART) + uart = 1; + else if (hdr->type == EFI_DEV_END_PATH || + hdr->type == EFI_DEV_END_PATH2) { + if (!uart) + return 0; + if (hdr->sub_type == EFI_DEV_END_ENTIRE) + return 1; + uart = 0; + } + hdr = (struct efi_generic_dev_path *) ((u8 *) hdr + hdr->length); + } + printk(KERN_ERR "Malformed %s value\n", name); + return 0; +} diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S new file mode 100644 index 000000000000..5a7fe70212a9 --- /dev/null +++ 
b/arch/ia64/kernel/efi_stub.S @@ -0,0 +1,86 @@
+/*
+ * EFI call stub.
+ *
+ * Copyright (C) 1999-2001 Hewlett-Packard Co
+ * David Mosberger <davidm@hpl.hp.com>
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off. We need this because we can't call SetVirtualAddressMap() until
+ * the kernel has booted far enough to allow allocation of struct vma_struct
+ * entries (which we would need to map stuff with memory attributes other
+ * than uncached or writeback...). Since the GetTime() service gets called
+ * earlier than that, we need to be able to make physical mode EFI calls from
+ * the kernel.
+ */
+
+/*
+ * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System
+ * Abstraction Layer Specification", revision 2.6e). Note that
+ * psr.dfl and psr.dfh MUST be cleared, despite what this manual says.
+ * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call
+ * (the br.ia instruction fails unless psr.dfl and psr.dfh are
+ * cleared). Fortunately, SAL promises not to touch the floating
+ * point regs, so at least we don't have to save f2-f127.
+ */
+#define PSR_BITS_TO_CLEAR \
+ (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | \
+ IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \
+ IA64_PSR_DFL | IA64_PSR_DFH)
+
+#define PSR_BITS_TO_SET \
+ (IA64_PSR_BN)
+
+#include <asm/processor.h>
+#include <asm/asmmacro.h>
+
+/*
+ * Inputs:
+ * in0 = address of function descriptor of EFI routine to call
+ * in1..in7 = arguments to routine
+ *
+ * Outputs:
+ * r8 = EFI_STATUS returned by called function
+ */
+
+GLOBAL_ENTRY(efi_call_phys)
+ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
+ alloc loc1=ar.pfs,8,7,7,0
+ ld8 r2=[in0],8 // load EFI function's entry point
+ mov loc0=rp
+ .body
+ ;;
+ mov loc2=gp // save global pointer
+ mov loc4=ar.rsc // save RSE configuration
+ mov ar.rsc=0 // put RSE in enforced lazy, LE mode
+ ;;
+ ld8 gp=[in0] // load EFI function's global pointer
+ movl r16=PSR_BITS_TO_CLEAR
+ mov loc3=psr // save processor status word
+ movl r17=PSR_BITS_TO_SET
+ ;;
+ or loc3=loc3,r17
+ mov b6=r2
+ ;;
+ andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared
+ br.call.sptk.many rp=ia64_switch_mode_phys
+.ret0: mov out4=in5
+ mov out0=in1
+ mov out1=in2
+ mov out2=in3
+ mov out3=in4
+ mov out5=in6
+ mov out6=in7
+ mov loc5=r19
+ mov loc6=r20
+ br.call.sptk.many rp=b6 // call the EFI function
+.ret1: mov ar.rsc=0 // put RSE in enforced lazy, LE mode
+ mov r16=loc3
+ mov r19=loc5
+ mov r20=loc6
+ br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode
+.ret2: mov ar.rsc=loc4 // restore RSE configuration
+ mov ar.pfs=loc1
+ mov rp=loc0
+ mov gp=loc2
+ br.ret.sptk.many rp
+END(efi_call_phys) diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S new file mode 100644 index 000000000000..0272c010a3ba --- /dev/null +++ b/arch/ia64/kernel/entry.S @@ -0,0 +1,1587 @@
+/*
+ * ia64/kernel/entry.S
+ *
+ * Kernel entry points.
+ *
+ * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999, 2002-2003
+ * Asit Mallick <Asit.K.Mallick@intel.com>
+ * Don Dugger <Don.Dugger@intel.com>
+ * Suresh Siddha <suresh.b.siddha@intel.com>
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ */
+/*
+ * ia64_switch_to now places correct virtual mapping in TR2 for
+ * kernel stack. This allows us to handle interrupts without changing
+ * to physical mode. 
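efi_call_phys above starts with ld8 r2=[in0],8 followed by ld8 gp=[in0]: on ia64 an indirect call target is not a code address but a two-word function descriptor holding the entry point and the callee's gp. A C rendering of that layout, with made-up values:

	#include <stdio.h>
	#include <stdint.h>

	struct ia64_fdesc {		/* layout assumed by the two ld8s above */
		uint64_t ip;		/* entry point -> branch register b6 */
		uint64_t gp;		/* callee's global pointer -> gp */
	};

	int main(void)
	{
		struct ia64_fdesc fd = { 0xa000000100010000ULL,	/* hypothetical */
					 0xa000000100200000ULL };

		printf("branch to 0x%llx with gp=0x%llx\n",
		       (unsigned long long)fd.ip, (unsigned long long)fd.gp);
		return 0;
	}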
+ * + * Jonathan Nicklin <nicklin@missioncriticallinux.com> + * Patrick O'Rourke <orourke@missioncriticallinux.com> + * 11/07/2000 + */ +/* + * Global (preserved) predicate usage on syscall entry/exit path: + * + * pKStk: See entry.h. + * pUStk: See entry.h. + * pSys: See entry.h. + * pNonSys: !pSys + */ + +#include <linux/config.h> + +#include <asm/asmmacro.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/kregs.h> +#include <asm/offsets.h> +#include <asm/pgtable.h> +#include <asm/percpu.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/unistd.h> + +#include "minstate.h" + + /* + * execve() is special because in case of success, we need to + * setup a null register window frame. + */ +ENTRY(ia64_execve) + /* + * Allocate 8 input registers since ptrace() may clobber them + */ + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc loc1=ar.pfs,8,2,4,0 + mov loc0=rp + .body + mov out0=in0 // filename + ;; // stop bit between alloc and call + mov out1=in1 // argv + mov out2=in2 // envp + add out3=16,sp // regs + br.call.sptk.many rp=sys_execve +.ret0: +#ifdef CONFIG_IA32_SUPPORT + /* + * Check if we're returning to ia32 mode. If so, we need to restore ia32 registers + * from pt_regs. + */ + adds r16=PT(CR_IPSR)+16,sp + ;; + ld8 r16=[r16] +#endif + cmp4.ge p6,p7=r8,r0 + mov ar.pfs=loc1 // restore ar.pfs + sxt4 r8=r8 // return 64-bit result + ;; + stf.spill [sp]=f0 +(p6) cmp.ne pKStk,pUStk=r0,r0 // a successful execve() lands us in user-mode... + mov rp=loc0 +(p6) mov ar.pfs=r0 // clear ar.pfs on success +(p7) br.ret.sptk.many rp + + /* + * In theory, we'd have to zap this state only to prevent leaking of + * security sensitive state (e.g., if current->mm->dumpable is zero). However, + * this executes in less than 20 cycles even on Itanium, so it's not worth + * optimizing for...). 
+ */
+ mov ar.unat=0; mov ar.lc=0
+ mov r4=0; mov f2=f0; mov b1=r0
+ mov r5=0; mov f3=f0; mov b2=r0
+ mov r6=0; mov f4=f0; mov b3=r0
+ mov r7=0; mov f5=f0; mov b4=r0
+ ldf.fill f12=[sp]; mov f13=f0; mov b5=r0
+ ldf.fill f14=[sp]; ldf.fill f15=[sp]; mov f16=f0
+ ldf.fill f17=[sp]; ldf.fill f18=[sp]; mov f19=f0
+ ldf.fill f20=[sp]; ldf.fill f21=[sp]; mov f22=f0
+ ldf.fill f23=[sp]; ldf.fill f24=[sp]; mov f25=f0
+ ldf.fill f26=[sp]; ldf.fill f27=[sp]; mov f28=f0
+ ldf.fill f29=[sp]; ldf.fill f30=[sp]; mov f31=f0
+#ifdef CONFIG_IA32_SUPPORT
+ tbit.nz p6,p0=r16, IA64_PSR_IS_BIT
+ movl loc0=ia64_ret_from_ia32_execve
+ ;;
+(p6) mov rp=loc0
+#endif
+ br.ret.sptk.many rp
+END(ia64_execve)
+
+/*
+ * sys_clone2(u64 flags, u64 ustack_base, u64 ustack_size, u64 parent_tidptr, u64 child_tidptr,
+ * u64 tls)
+ */
+GLOBAL_ENTRY(sys_clone2)
+ /*
+ * Allocate 8 input registers since ptrace() may clobber them
+ */
+ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
+ alloc r16=ar.pfs,8,2,6,0
+ DO_SAVE_SWITCH_STACK
+ adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp
+ mov loc0=rp
+ mov loc1=r16 // save ar.pfs across do_fork
+ .body
+ mov out1=in1
+ mov out3=in2
+ tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
+ mov out4=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID
+ ;;
+(p6) st8 [r2]=in5 // store TLS in r16 for copy_thread()
+ mov out5=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
+ adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = &regs
+ mov out0=in0 // out0 = clone_flags
+ br.call.sptk.many rp=do_fork
+.ret1: .restore sp
+ adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack
+ mov ar.pfs=loc1
+ mov rp=loc0
+ br.ret.sptk.many rp
+END(sys_clone2)
+
+/*
+ * sys_clone(u64 flags, u64 ustack_base, u64 parent_tidptr, u64 child_tidptr, u64 tls)
+ * Deprecated. Use sys_clone2() instead.
+ */
+GLOBAL_ENTRY(sys_clone)
+ /*
+ * Allocate 8 input registers since ptrace() may clobber them
+ */
+ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
+ alloc r16=ar.pfs,8,2,6,0
+ DO_SAVE_SWITCH_STACK
+ adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp
+ mov loc0=rp
+ mov loc1=r16 // save ar.pfs across do_fork
+ .body
+ mov out1=in1
+ mov out3=16 // stacksize (compensates for 16-byte scratch area)
+ tbit.nz p6,p0=in0,CLONE_SETTLS_BIT
+ mov out4=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID
+ ;;
+(p6) st8 [r2]=in4 // store TLS in r13 (tp)
+ mov out5=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID
+ adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = &regs
+ mov out0=in0 // out0 = clone_flags
+ br.call.sptk.many rp=do_fork
+.ret2: .restore sp
+ adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack
+ mov ar.pfs=loc1
+ mov rp=loc0
+ br.ret.sptk.many rp
+END(sys_clone)
+
+/*
+ * prev_task <- ia64_switch_to(struct task_struct *next)
+ * With Ingo's new scheduler, interrupts are disabled when this routine gets
+ * called. The code starting at .map relies on this. The rest of the code
+ * doesn't care about the interrupt masking status.
+ */
+GLOBAL_ENTRY(ia64_switch_to)
+ .prologue
+ alloc r16=ar.pfs,1,0,0,0
+ DO_SAVE_SWITCH_STACK
+ .body
+
+ adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13
+ movl r25=init_task
+ mov r27=IA64_KR(CURRENT_STACK)
+ adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0
+ dep r20=0,in0,61,3 // physical address of "next"
+ ;;
+ st8 [r22]=sp // save kernel stack pointer of old task
+ shr.u r26=r20,IA64_GRANULE_SHIFT
+ cmp.eq p7,p6=r25,in0
+ ;;
+ /*
+ * If we've already mapped this task's page, we can skip doing it again. 
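The .map decision that follows boils down to: strip the region bits from the task pointer (dep r20=0,in0,61,3 above), compare granule numbers against the one recorded in IA64_KR(CURRENT_STACK), and skip the itr.d when they match. The same arithmetic in user-space C, with an assumed 64MB granule and hypothetical values:

	#include <stdio.h>
	#include <stdint.h>

	#define GRANULE_SHIFT 26	/* assumed 64MB granule (IA64_GRANULE_SHIFT) */

	int main(void)
	{
		uint64_t next_task = 0xe000000004512000ULL;	/* hypothetical region-7 VA */
		uint64_t mapped_granule = 0x1;			/* from IA64_KR(CURRENT_STACK) */
		uint64_t phys = next_task & ~(7ULL << 61);	/* dep r20=0,in0,61,3 */
		uint64_t granule = phys >> GRANULE_SHIFT;	/* shr.u r26=r20,... */

		if (granule == mapped_granule)
			printf("stack granule already wired, skip itr.d\n");
		else
			printf("wire granule %llu with itr.d\n",
			       (unsigned long long)granule);
		return 0;
	}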
+ */ +(p6) cmp.eq p7,p6=r26,r27 +(p6) br.cond.dpnt .map + ;; +.done: +(p6) ssm psr.ic // if we had to map, reenable the psr.ic bit FIRST!!! + ;; +(p6) srlz.d + ld8 sp=[r21] // load kernel stack pointer of new task + mov IA64_KR(CURRENT)=in0 // update "current" application register + mov r8=r13 // return pointer to previously running task + mov r13=in0 // set "current" pointer + ;; + DO_LOAD_SWITCH_STACK + +#ifdef CONFIG_SMP + sync.i // ensure "fc"s done by this CPU are visible on other CPUs +#endif + br.ret.sptk.many rp // boogie on out in new context + +.map: + rsm psr.ic // interrupts (psr.i) are already disabled here + movl r25=PAGE_KERNEL + ;; + srlz.d + or r23=r25,r20 // construct PA | page properties + mov r25=IA64_GRANULE_SHIFT<<2 + ;; + mov cr.itir=r25 + mov cr.ifa=in0 // VA of next task... + ;; + mov r25=IA64_TR_CURRENT_STACK + mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... + ;; + itr.d dtr[r25]=r23 // wire in new mapping... + br.cond.sptk .done +END(ia64_switch_to) + +/* + * Note that interrupts are enabled during save_switch_stack and load_switch_stack. This + * means that we may get an interrupt with "sp" pointing to the new kernel stack while + * ar.bspstore is still pointing to the old kernel backing store area. Since ar.rsc, + * ar.rnat, ar.bsp, and ar.bspstore are all preserved by interrupts, this is not a + * problem. Also, we don't need to specify unwind information for preserved registers + * that are not modified in save_switch_stack as the right unwind information is already + * specified at the call-site of save_switch_stack. + */ + +/* + * save_switch_stack: + * - r16 holds ar.pfs + * - b7 holds address to return to + * - rp (b0) holds return address to save + */ +GLOBAL_ENTRY(save_switch_stack) + .prologue + .altrp b7 + flushrs // flush dirty regs to backing store (must be first in insn group) + .save @priunat,r17 + mov r17=ar.unat // preserve caller's + .body +#ifdef CONFIG_ITANIUM + adds r2=16+128,sp + adds r3=16+64,sp + adds r14=SW(R4)+16,sp + ;; + st8.spill [r14]=r4,16 // spill r4 + lfetch.fault.excl.nt1 [r3],128 + ;; + lfetch.fault.excl.nt1 [r2],128 + lfetch.fault.excl.nt1 [r3],128 + ;; + lfetch.fault.excl [r2] + lfetch.fault.excl [r3] + adds r15=SW(R5)+16,sp +#else + add r2=16+3*128,sp + add r3=16,sp + add r14=SW(R4)+16,sp + ;; + st8.spill [r14]=r4,SW(R6)-SW(R4) // spill r4 and prefetch offset 0x1c0 + lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x010 + ;; + lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x090 + lfetch.fault.excl.nt1 [r2],128 // prefetch offset 0x190 + ;; + lfetch.fault.excl.nt1 [r3] // prefetch offset 0x110 + lfetch.fault.excl.nt1 [r2] // prefetch offset 0x210 + adds r15=SW(R5)+16,sp +#endif + ;; + st8.spill [r15]=r5,SW(R7)-SW(R5) // spill r5 + mov.m ar.rsc=0 // put RSE in mode: enforced lazy, little endian, pl 0 + add r2=SW(F2)+16,sp // r2 = &sw->f2 + ;; + st8.spill [r14]=r6,SW(B0)-SW(R6) // spill r6 + mov.m r18=ar.fpsr // preserve fpsr + add r3=SW(F3)+16,sp // r3 = &sw->f3 + ;; + stf.spill [r2]=f2,32 + mov.m r19=ar.rnat + mov r21=b0 + + stf.spill [r3]=f3,32 + st8.spill [r15]=r7,SW(B2)-SW(R7) // spill r7 + mov r22=b1 + ;; + // since we're done with the spills, read and save ar.unat: + mov.m r29=ar.unat + mov.m r20=ar.bspstore + mov r23=b2 + stf.spill [r2]=f4,32 + stf.spill [r3]=f5,32 + mov r24=b3 + ;; + st8 [r14]=r21,SW(B1)-SW(B0) // save b0 + st8 [r15]=r23,SW(B3)-SW(B2) // save b2 + mov r25=b4 + mov r26=b5 + ;; + st8 [r14]=r22,SW(B4)-SW(B1) // save b1 + st8 [r15]=r24,SW(AR_PFS)-SW(B3) // save b3 + mov r21=ar.lc // 
I-unit + stf.spill [r2]=f12,32 + stf.spill [r3]=f13,32 + ;; + st8 [r14]=r25,SW(B5)-SW(B4) // save b4 + st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS) // save ar.pfs + stf.spill [r2]=f14,32 + stf.spill [r3]=f15,32 + ;; + st8 [r14]=r26 // save b5 + st8 [r15]=r21 // save ar.lc + stf.spill [r2]=f16,32 + stf.spill [r3]=f17,32 + ;; + stf.spill [r2]=f18,32 + stf.spill [r3]=f19,32 + ;; + stf.spill [r2]=f20,32 + stf.spill [r3]=f21,32 + ;; + stf.spill [r2]=f22,32 + stf.spill [r3]=f23,32 + ;; + stf.spill [r2]=f24,32 + stf.spill [r3]=f25,32 + ;; + stf.spill [r2]=f26,32 + stf.spill [r3]=f27,32 + ;; + stf.spill [r2]=f28,32 + stf.spill [r3]=f29,32 + ;; + stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30) + stf.spill [r3]=f31,SW(PR)-SW(F31) + add r14=SW(CALLER_UNAT)+16,sp + ;; + st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT) // save ar.unat + st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat + mov r21=pr + ;; + st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat + st8 [r3]=r21 // save predicate registers + ;; + st8 [r2]=r20 // save ar.bspstore + st8 [r14]=r18 // save fpsr + mov ar.rsc=3 // put RSE back into eager mode, pl 0 + br.cond.sptk.many b7 +END(save_switch_stack) + +/* + * load_switch_stack: + * - "invala" MUST be done at call site (normally in DO_LOAD_SWITCH_STACK) + * - b7 holds address to return to + * - must not touch r8-r11 + */ +ENTRY(load_switch_stack) + .prologue + .altrp b7 + + .body + lfetch.fault.nt1 [sp] + adds r2=SW(AR_BSPSTORE)+16,sp + adds r3=SW(AR_UNAT)+16,sp + mov ar.rsc=0 // put RSE into enforced lazy mode + adds r14=SW(CALLER_UNAT)+16,sp + adds r15=SW(AR_FPSR)+16,sp + ;; + ld8 r27=[r2],(SW(B0)-SW(AR_BSPSTORE)) // bspstore + ld8 r29=[r3],(SW(B1)-SW(AR_UNAT)) // unat + ;; + ld8 r21=[r2],16 // restore b0 + ld8 r22=[r3],16 // restore b1 + ;; + ld8 r23=[r2],16 // restore b2 + ld8 r24=[r3],16 // restore b3 + ;; + ld8 r25=[r2],16 // restore b4 + ld8 r26=[r3],16 // restore b5 + ;; + ld8 r16=[r2],(SW(PR)-SW(AR_PFS)) // ar.pfs + ld8 r17=[r3],(SW(AR_RNAT)-SW(AR_LC)) // ar.lc + ;; + ld8 r28=[r2] // restore pr + ld8 r30=[r3] // restore rnat + ;; + ld8 r18=[r14],16 // restore caller's unat + ld8 r19=[r15],24 // restore fpsr + ;; + ldf.fill f2=[r14],32 + ldf.fill f3=[r15],32 + ;; + ldf.fill f4=[r14],32 + ldf.fill f5=[r15],32 + ;; + ldf.fill f12=[r14],32 + ldf.fill f13=[r15],32 + ;; + ldf.fill f14=[r14],32 + ldf.fill f15=[r15],32 + ;; + ldf.fill f16=[r14],32 + ldf.fill f17=[r15],32 + ;; + ldf.fill f18=[r14],32 + ldf.fill f19=[r15],32 + mov b0=r21 + ;; + ldf.fill f20=[r14],32 + ldf.fill f21=[r15],32 + mov b1=r22 + ;; + ldf.fill f22=[r14],32 + ldf.fill f23=[r15],32 + mov b2=r23 + ;; + mov ar.bspstore=r27 + mov ar.unat=r29 // establish unat holding the NaT bits for r4-r7 + mov b3=r24 + ;; + ldf.fill f24=[r14],32 + ldf.fill f25=[r15],32 + mov b4=r25 + ;; + ldf.fill f26=[r14],32 + ldf.fill f27=[r15],32 + mov b5=r26 + ;; + ldf.fill f28=[r14],32 + ldf.fill f29=[r15],32 + mov ar.pfs=r16 + ;; + ldf.fill f30=[r14],32 + ldf.fill f31=[r15],24 + mov ar.lc=r17 + ;; + ld8.fill r4=[r14],16 + ld8.fill r5=[r15],16 + mov pr=r28,-1 + ;; + ld8.fill r6=[r14],16 + ld8.fill r7=[r15],16 + + mov ar.unat=r18 // restore caller's unat + mov ar.rnat=r30 // must restore after bspstore but before rsc! 
+ mov ar.fpsr=r19 // restore fpsr + mov ar.rsc=3 // put RSE back into eager mode, pl 0 + br.cond.sptk.many b7 +END(load_switch_stack) + +GLOBAL_ENTRY(__ia64_syscall) + .regstk 6,0,0,0 + mov r15=in5 // put syscall number in place + break __BREAK_SYSCALL + movl r2=errno + cmp.eq p6,p7=-1,r10 + ;; +(p6) st4 [r2]=r8 +(p6) mov r8=-1 + br.ret.sptk.many rp +END(__ia64_syscall) + +GLOBAL_ENTRY(execve) + mov r15=__NR_execve // put syscall number in place + break __BREAK_SYSCALL + br.ret.sptk.many rp +END(execve) + +GLOBAL_ENTRY(clone) + mov r15=__NR_clone // put syscall number in place + break __BREAK_SYSCALL + br.ret.sptk.many rp +END(clone) + + /* + * Invoke a system call, but do some tracing before and after the call. + * We MUST preserve the current register frame throughout this routine + * because some system calls (such as ia64_execve) directly + * manipulate ar.pfs. + */ +GLOBAL_ENTRY(ia64_trace_syscall) + PT_REGS_UNWIND_INFO(0) + /* + * We need to preserve the scratch registers f6-f11 in case the system + * call is sigreturn. + */ + adds r16=PT(F6)+16,sp + adds r17=PT(F7)+16,sp + ;; + stf.spill [r16]=f6,32 + stf.spill [r17]=f7,32 + ;; + stf.spill [r16]=f8,32 + stf.spill [r17]=f9,32 + ;; + stf.spill [r16]=f10 + stf.spill [r17]=f11 + br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args + adds r16=PT(F6)+16,sp + adds r17=PT(F7)+16,sp + ;; + ldf.fill f6=[r16],32 + ldf.fill f7=[r17],32 + ;; + ldf.fill f8=[r16],32 + ldf.fill f9=[r17],32 + ;; + ldf.fill f10=[r16] + ldf.fill f11=[r17] + // the syscall number may have changed, so re-load it and re-calculate the + // syscall entry-point: + adds r15=PT(R15)+16,sp // r15 = &pt_regs.r15 (syscall #) + ;; + ld8 r15=[r15] + mov r3=NR_syscalls - 1 + ;; + adds r15=-1024,r15 + movl r16=sys_call_table + ;; + shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024) + cmp.leu p6,p7=r15,r3 + ;; +(p6) ld8 r20=[r20] // load address of syscall entry point +(p7) movl r20=sys_ni_syscall + ;; + mov b6=r20 + br.call.sptk.many rp=b6 // do the syscall +.strace_check_retval: + cmp.lt p6,p0=r8,r0 // syscall failed? + adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 + adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10 + mov r10=0 +(p6) br.cond.sptk strace_error // syscall failed -> + ;; // avoid RAW on r10 +.strace_save_retval: +.mem.offset 0,0; st8.spill [r2]=r8 // store return value in slot for r8 +.mem.offset 8,0; st8.spill [r3]=r10 // clear error indication in slot for r10 + br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value +.ret3: br.cond.sptk .work_pending_syscall_end + +strace_error: + ld8 r3=[r2] // load pt_regs.r8 + sub r9=0,r8 // negate return value to get errno value + ;; + cmp.ne p6,p0=r3,r0 // is pt_regs.r8!=0? + adds r3=16,r2 // r3=&pt_regs.r10 + ;; +(p6) mov r10=-1 +(p6) mov r8=r9 + br.cond.sptk .strace_save_retval +END(ia64_trace_syscall) + + /* + * When traced and returning from sigreturn, we invoke syscall_trace but then + * go straight to ia64_leave_kernel rather than ia64_leave_syscall. + */ +GLOBAL_ENTRY(ia64_strace_leave_kernel) + PT_REGS_UNWIND_INFO(0) +{ /* + * Some versions of gas generate bad unwind info if the first instruction of a + * procedure doesn't go into the first slot of a bundle. This is a workaround. 
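The re-dispatch in ia64_trace_syscall above (adds r15=-1024,r15; cmp.leu; shladd) relies on ia64 syscall numbers starting at 1024 and on a single unsigned compare rejecting both too-small and too-large numbers. A toy C model, in which the table size and entries are dummies:

	#include <stdio.h>

	#define FIRST_SYSCALL	1024
	#define NR_SYSCALLS	3			/* toy table size */

	static long sys_a(void) { return 0; }
	static long sys_b(void) { return 1; }
	static long sys_ni_syscall(void) { return -38; }	/* -ENOSYS */

	static long (*sys_call_table[NR_SYSCALLS])(void) = { sys_a, sys_b, sys_a };

	static long dispatch(unsigned long nr)
	{
		unsigned long idx = nr - FIRST_SYSCALL;	/* adds r15=-1024,r15 */

		/* cmp.leu: one unsigned compare catches both bounds,
		 * since a too-small nr wraps to a huge index */
		if (idx <= NR_SYSCALLS - 1)
			return sys_call_table[idx]();	/* shladd: table + 8*idx */
		return sys_ni_syscall();
	}

	int main(void)
	{
		printf("%ld %ld\n", dispatch(1025), dispatch(99));	/* 1 -38 */
		return 0;
	}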
+ */ + nop.m 0 + nop.i 0 + br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value +} +.ret4: br.cond.sptk ia64_leave_kernel +END(ia64_strace_leave_kernel) + +GLOBAL_ENTRY(ia64_ret_from_clone) + PT_REGS_UNWIND_INFO(0) +{ /* + * Some versions of gas generate bad unwind info if the first instruction of a + * procedure doesn't go into the first slot of a bundle. This is a workaround. + */ + nop.m 0 + nop.i 0 + /* + * We need to call schedule_tail() to complete the scheduling process. + * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the + * address of the previously executing task. + */ + br.call.sptk.many rp=ia64_invoke_schedule_tail +} +.ret8: + adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; + ld4 r2=[r2] + ;; + mov r8=0 + and r2=_TIF_SYSCALL_TRACEAUDIT,r2 + ;; + cmp.ne p6,p0=r2,r0 +(p6) br.cond.spnt .strace_check_retval + ;; // added stop bits to prevent r8 dependency +END(ia64_ret_from_clone) + // fall through +GLOBAL_ENTRY(ia64_ret_from_syscall) + PT_REGS_UNWIND_INFO(0) + cmp.ge p6,p7=r8,r0 // syscall executed successfully? + adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 + mov r10=r0 // clear error indication in r10 +(p7) br.cond.spnt handle_syscall_error // handle potential syscall failure +END(ia64_ret_from_syscall) + // fall through +/* + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't + * need to switch to bank 0 and doesn't restore the scratch registers. + * To avoid leaking kernel bits, the scratch registers are set to + * the following known-to-be-safe values: + * + * r1: restored (global pointer) + * r2: cleared + * r3: 1 (when returning to user-level) + * r8-r11: restored (syscall return value(s)) + * r12: restored (user-level stack pointer) + * r13: restored (user-level thread pointer) + * r14: cleared + * r15: restored (syscall #) + * r16-r17: cleared + * r18: user-level b6 + * r19: cleared + * r20: user-level ar.fpsr + * r21: user-level b0 + * r22: cleared + * r23: user-level ar.bspstore + * r24: user-level ar.rnat + * r25: user-level ar.unat + * r26: user-level ar.pfs + * r27: user-level ar.rsc + * r28: user-level ip + * r29: user-level psr + * r30: user-level cfm + * r31: user-level pr + * f6-f11: cleared + * pr: restored (user-level pr) + * b0: restored (user-level rp) + * b6: restored + * b7: cleared + * ar.unat: restored (user-level ar.unat) + * ar.pfs: restored (user-level ar.pfs) + * ar.rsc: restored (user-level ar.rsc) + * ar.rnat: restored (user-level ar.rnat) + * ar.bspstore: restored (user-level ar.bspstore) + * ar.fpsr: restored (user-level ar.fpsr) + * ar.ccv: cleared + * ar.csd: cleared + * ar.ssd: cleared + */ +ENTRY(ia64_leave_syscall) + PT_REGS_UNWIND_INFO(0) + /* + * work.need_resched etc. mustn't get changed by this CPU before it returns to + * user- or fsys-mode, hence we disable interrupts early on. + * + * p6 controls whether current_thread_info()->flags needs to be check for + * extra work. We always check for extra work when returning to user-level. + * With CONFIG_PREEMPT, we also check for extra work when the preempt_count + * is 0. After extra work processing has been completed, execution + * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check + * needs to be redone. 
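In rough C terms, the exit-path comment below describes the following loop; this is an illustrative model only (the real code keys off the pUStk/pKStk predicates and TIF_WORK_MASK, and the flag word here stands in for current_thread_info()->flags):

	#include <stdio.h>

	#define TIF_WORK_MASK 0x7		/* made-up mask for the sketch */

	static unsigned int thread_flags = 0x4;	/* pretend a signal is pending */

	static void local_irq_disable(void) { puts("irqs off"); }

	static void do_pending_work(void)	/* schedule()/notify_resume_user() here */
	{
		puts("handling pending work");
		thread_flags = 0;
	}

	int main(void)
	{
		local_irq_disable();		/* flags must not change under us */
		while (thread_flags & TIF_WORK_MASK) {
			do_pending_work();	/* may re-enable interrupts */
			local_irq_disable();	/* then redo the check (p6 <- 1) */
		}
		puts("restore registers and rfi");
		return 0;
	}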
+ */
+#ifdef CONFIG_PREEMPT
+ rsm psr.i // disable interrupts
+ cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
+(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
+ ;;
+ .pred.rel.mutex pUStk,pKStk
+(pKStk) ld4 r21=[r20] // r21 <- preempt_count
+(pUStk) mov r21=0 // r21 <- 0
+ ;;
+ cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0)
+#else /* !CONFIG_PREEMPT */
+(pUStk) rsm psr.i
+ cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall
+(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
+#endif
+.work_processed_syscall:
+ adds r2=PT(LOADRS)+16,r12
+ adds r3=PT(AR_BSPSTORE)+16,r12
+ adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
+ ;;
+(p6) ld4 r31=[r18] // load current_thread_info()->flags
+ ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
+ mov b7=r0 // clear b7
+ ;;
+ ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
+ ld8 r18=[r2],PT(R9)-PT(B6) // load b6
+(p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
+ ;;
+ mov r16=ar.bsp // M2 get existing backing store pointer
+(p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending?
+(p6) br.cond.spnt .work_pending_syscall
+ ;;
+ // start restoring the state saved on the kernel stack (struct pt_regs):
+ ld8 r9=[r2],PT(CR_IPSR)-PT(R9)
+ ld8 r11=[r3],PT(CR_IIP)-PT(R11)
+ mov f6=f0 // clear f6
+ ;;
+ invala // M0|1 invalidate ALAT
+ rsm psr.i | psr.ic // M2 initiate turning off of interrupt and interruption collection
+ mov f9=f0 // clear f9
+
+ ld8 r29=[r2],16 // load cr.ipsr
+ ld8 r28=[r3],16 // load cr.iip
+ mov f8=f0 // clear f8
+ ;;
+ ld8 r30=[r2],16 // M0|1 load cr.ifs
+ mov.m ar.ssd=r0 // M2 clear ar.ssd
+ cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs
+ ;;
+ ld8 r25=[r3],16 // M0|1 load ar.unat
+ mov.m ar.csd=r0 // M2 clear ar.csd
+ mov r22=r0 // clear r22
+ ;;
+ ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs
+(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled
+ mov f10=f0 // clear f10
+ ;;
+ ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
+ ld8 r27=[r3],PT(PR)-PT(AR_RSC) // load ar.rsc
+ mov f11=f0 // clear f11
+ ;;
+ ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // load ar.rnat (may be garbage)
+ ld8 r31=[r3],PT(R1)-PT(PR) // load predicates
+(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
+ ;;
+ ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // load ar.fpsr
+ ld8.fill r1=[r3],16 // load r1
+(pUStk) mov r17=1
+ ;;
+ srlz.d // M0 ensure interruption collection is off
+ ld8.fill r13=[r3],16
+ mov f7=f0 // clear f7
+ ;;
+ ld8.fill r12=[r2] // restore r12 (sp)
+ ld8.fill r15=[r3] // restore r15
+ addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
+ ;;
+(pUStk) ld4 r3=[r3] // r3 = cpu_data->phys_stacked_size_p8
+(pUStk) st1 [r14]=r17
+ mov b6=r18 // I0 restore b6
+ ;;
+ mov r14=r0 // clear r14
+ shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition
+(pKStk) br.cond.dpnt.many skip_rbs_switch
+
+ mov.m ar.ccv=r0 // clear ar.ccv
+(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
+ br.cond.sptk.many rbs_switch
+END(ia64_leave_syscall)
+
+#ifdef CONFIG_IA32_SUPPORT
+GLOBAL_ENTRY(ia64_ret_from_ia32_execve)
+ PT_REGS_UNWIND_INFO(0)
+ adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8
+ adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10
+ ;;
+ .mem.offset 0,0
+ st8.spill [r2]=r8 // store return value in slot for r8 and set unat bit
+ .mem.offset 8,0
+ st8.spill [r3]=r0 // clear error indication in slot for r10 and set unat bit
+END(ia64_ret_from_ia32_execve)
+ // fall through
+#endif /* CONFIG_IA32_SUPPORT */
+GLOBAL_ENTRY(ia64_leave_kernel)
+ 
PT_REGS_UNWIND_INFO(0) + /* + * work.need_resched etc. mustn't get changed by this CPU before it returns to + * user- or fsys-mode, hence we disable interrupts early on. + * + * p6 controls whether current_thread_info()->flags needs to be check for + * extra work. We always check for extra work when returning to user-level. + * With CONFIG_PREEMPT, we also check for extra work when the preempt_count + * is 0. After extra work processing has been completed, execution + * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check + * needs to be redone. + */ +#ifdef CONFIG_PREEMPT + rsm psr.i // disable interrupts + cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 + ;; + .pred.rel.mutex pUStk,pKStk +(pKStk) ld4 r21=[r20] // r21 <- preempt_count +(pUStk) mov r21=0 // r21 <- 0 + ;; + cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0) +#else +(pUStk) rsm psr.i + cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk +#endif +.work_processed_kernel: + adds r17=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; +(p6) ld4 r31=[r17] // load current_thread_info()->flags + adds r21=PT(PR)+16,r12 + ;; + + lfetch [r21],PT(CR_IPSR)-PT(PR) + adds r2=PT(B6)+16,r12 + adds r3=PT(R16)+16,r12 + ;; + lfetch [r21] + ld8 r28=[r2],8 // load b6 + adds r29=PT(R24)+16,r12 + + ld8.fill r16=[r3],PT(AR_CSD)-PT(R16) + adds r30=PT(AR_CCV)+16,r12 +(p6) and r19=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? + ;; + ld8.fill r24=[r29] + ld8 r15=[r30] // load ar.ccv +(p6) cmp4.ne.unc p6,p0=r19, r0 // any special work pending? + ;; + ld8 r29=[r2],16 // load b7 + ld8 r30=[r3],16 // load ar.csd +(p6) br.cond.spnt .work_pending + ;; + ld8 r31=[r2],16 // load ar.ssd + ld8.fill r8=[r3],16 + ;; + ld8.fill r9=[r2],16 + ld8.fill r10=[r3],PT(R17)-PT(R10) + ;; + ld8.fill r11=[r2],PT(R18)-PT(R11) + ld8.fill r17=[r3],16 + ;; + ld8.fill r18=[r2],16 + ld8.fill r19=[r3],16 + ;; + ld8.fill r20=[r2],16 + ld8.fill r21=[r3],16 + mov ar.csd=r30 + mov ar.ssd=r31 + ;; + rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection + invala // invalidate ALAT + ;; + ld8.fill r22=[r2],24 + ld8.fill r23=[r3],24 + mov b6=r28 + ;; + ld8.fill r25=[r2],16 + ld8.fill r26=[r3],16 + mov b7=r29 + ;; + ld8.fill r27=[r2],16 + ld8.fill r28=[r3],16 + ;; + ld8.fill r29=[r2],16 + ld8.fill r30=[r3],24 + ;; + ld8.fill r31=[r2],PT(F9)-PT(R31) + adds r3=PT(F10)-PT(F6),r3 + ;; + ldf.fill f9=[r2],PT(F6)-PT(F9) + ldf.fill f10=[r3],PT(F8)-PT(F10) + ;; + ldf.fill f6=[r2],PT(F7)-PT(F6) + ;; + ldf.fill f7=[r2],PT(F11)-PT(F7) + ldf.fill f8=[r3],32 + ;; + srlz.i // ensure interruption collection is off + mov ar.ccv=r15 + ;; + ldf.fill f11=[r2] + bsw.0 // switch back to bank 0 (no stop bit required beforehand...) 
+ ;; +(pUStk) mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency) + adds r16=PT(CR_IPSR)+16,r12 + adds r17=PT(CR_IIP)+16,r12 + +(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled + nop.i 0 + nop.i 0 + ;; + ld8 r29=[r16],16 // load cr.ipsr + ld8 r28=[r17],16 // load cr.iip + ;; + ld8 r30=[r16],16 // load cr.ifs + ld8 r25=[r17],16 // load ar.unat + ;; + ld8 r26=[r16],16 // load ar.pfs + ld8 r27=[r17],16 // load ar.rsc + cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs + ;; + ld8 r24=[r16],16 // load ar.rnat (may be garbage) + ld8 r23=[r17],16 // load ar.bspstore (may be garbage) + ;; + ld8 r31=[r16],16 // load predicates + ld8 r21=[r17],16 // load b0 + ;; + ld8 r19=[r16],16 // load ar.rsc value for "loadrs" + ld8.fill r1=[r17],16 // load r1 + ;; + ld8.fill r12=[r16],16 + ld8.fill r13=[r17],16 +(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 + ;; + ld8 r20=[r16],16 // ar.fpsr + ld8.fill r15=[r17],16 + ;; + ld8.fill r14=[r16],16 + ld8.fill r2=[r17] +(pUStk) mov r17=1 + ;; + ld8.fill r3=[r16] +(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack + shr.u r18=r19,16 // get byte size of existing "dirty" partition + ;; + mov r16=ar.bsp // get existing backing store pointer + addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 + ;; + ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 +(pKStk) br.cond.dpnt skip_rbs_switch + + /* + * Restore user backing store. + * + * NOTE: alloc, loadrs, and cover can't be predicated. + */ +(pNonSys) br.cond.dpnt dont_preserve_current_frame + +rbs_switch: + cover // add current frame into dirty partition and set cr.ifs + ;; + mov r19=ar.bsp // get new backing store pointer + sub r16=r16,r18 // krbs = old bsp - size of dirty partition + cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs + ;; + sub r19=r19,r16 // calculate total byte size of dirty partition + add r18=64,r18 // don't force in0-in7 into memory... + ;; + shl r19=r19,16 // shift size of dirty partition into loadrs position + ;; +dont_preserve_current_frame: + /* + * To prevent leaking bits between the kernel and user-space, + * we must clear the stacked registers in the "invalid" partition here. + * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium, + * 5 registers/cycle on McKinley). 
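A toy model of the sizing the clearing loop below starts from: the invalid partition is (physStackedSize + 8) - dirtySize bytes plus one RNaT slot per 512 dirty bytes (shr.u loc1=r18,9; shladd in0=loc1,3,r17), and it is consumed Nregs registers per recursive alloc. The example sizes here are assumptions:

	#include <stdio.h>

	#define NREGS 14	/* Nregs on non-Itanium-1 CPUs, as below */

	int main(void)
	{
		unsigned long phys_stacked_p8 = 96 * 8 + 8;	/* 96 stacked regs + 8 */
		unsigned long dirty = 40 * 8;			/* dirty partition, bytes */
		unsigned long rnat_slots = dirty >> 9;		/* shr.u loc1=r18,9 */
		unsigned long invalid = (phys_stacked_p8 - dirty) + rnat_slots * 8;
		int depth = 0;

		while (invalid > NREGS * 8) {	/* cmp.lt pRecurse,p0=Nregs*8,in0 */
			invalid -= NREGS * 8;
			depth++;		/* one more recursive alloc */
		}
		printf("%d recursions, %lu bytes cleared by the innermost frame\n",
		       depth, invalid);
		return 0;
	}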
+ */ +# define pRecurse p6 +# define pReturn p7 +#ifdef CONFIG_ITANIUM +# define Nregs 10 +#else +# define Nregs 14 +#endif + alloc loc0=ar.pfs,2,Nregs-2,2,0 + shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8)) + sub r17=r17,r18 // r17 = (physStackedSize + 8) - dirtySize + ;; + mov ar.rsc=r19 // load ar.rsc to be used for "loadrs" + shladd in0=loc1,3,r17 + mov in1=0 + ;; + TEXT_ALIGN(32) +rse_clear_invalid: +#ifdef CONFIG_ITANIUM + // cycle 0 + { .mii + alloc loc0=ar.pfs,2,Nregs-2,2,0 + cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse + add out0=-Nregs*8,in0 +}{ .mfb + add out1=1,in1 // increment recursion count + nop.f 0 + nop.b 0 // can't do br.call here because of alloc (WAW on CFM) + ;; +}{ .mfi // cycle 1 + mov loc1=0 + nop.f 0 + mov loc2=0 +}{ .mib + mov loc3=0 + mov loc4=0 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid + +}{ .mfi // cycle 2 + mov loc5=0 + nop.f 0 + cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret +}{ .mib + mov loc6=0 + mov loc7=0 +(pReturn) br.ret.sptk.many b0 +} +#else /* !CONFIG_ITANIUM */ + alloc loc0=ar.pfs,2,Nregs-2,2,0 + cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse + add out0=-Nregs*8,in0 + add out1=1,in1 // increment recursion count + mov loc1=0 + mov loc2=0 + ;; + mov loc3=0 + mov loc4=0 + mov loc5=0 + mov loc6=0 + mov loc7=0 +(pRecurse) br.call.sptk.few b0=rse_clear_invalid + ;; + mov loc8=0 + mov loc9=0 + cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret + mov loc10=0 + mov loc11=0 +(pReturn) br.ret.sptk.many b0 +#endif /* !CONFIG_ITANIUM */ +# undef pRecurse +# undef pReturn + ;; + alloc r17=ar.pfs,0,0,0,0 // drop current register frame + ;; + loadrs + ;; +skip_rbs_switch: + mov ar.unat=r25 // M2 +(pKStk) extr.u r22=r22,21,1 // I0 extract current value of psr.pp from r22 +(pLvSys)mov r19=r0 // A clear r19 for leave_syscall, no-op otherwise + ;; +(pUStk) mov ar.bspstore=r23 // M2 +(pKStk) dep r29=r22,r29,21,1 // I0 update ipsr.pp with psr.pp +(pLvSys)mov r16=r0 // A clear r16 for leave_syscall, no-op otherwise + ;; + mov cr.ipsr=r29 // M2 + mov ar.pfs=r26 // I0 +(pLvSys)mov r17=r0 // A clear r17 for leave_syscall, no-op otherwise + +(p9) mov cr.ifs=r30 // M2 + mov b0=r21 // I0 +(pLvSys)mov r18=r0 // A clear r18 for leave_syscall, no-op otherwise + + mov ar.fpsr=r20 // M2 + mov cr.iip=r28 // M2 + nop 0 + ;; +(pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode + nop 0 +(pLvSys)mov r2=r0 + + mov ar.rsc=r27 // M2 + mov pr=r31,-1 // I0 + rfi // B + + /* + * On entry: + * r20 = ¤t->thread_info->pre_count (if CONFIG_PREEMPT) + * r31 = current->thread_info->flags + * On exit: + * p6 = TRUE if work-pending-check needs to be redone + */ +.work_pending_syscall: + add r2=-8,r2 + add r3=-8,r3 + ;; + st8 [r2]=r8 + st8 [r3]=r10 +.work_pending: + tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context? +(p6) br.cond.sptk.few .sigdelayed + ;; + tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? 
+(p6) br.cond.sptk.few .notify
+#ifdef CONFIG_PREEMPT
+(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1
+ ;;
+(pKStk) st4 [r20]=r21
+ ssm psr.i // enable interrupts
+#endif
+ br.call.spnt.many rp=schedule
+.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1
+ rsm psr.i // disable interrupts
+ ;;
+#ifdef CONFIG_PREEMPT
+(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13
+ ;;
+(pKStk) st4 [r20]=r0 // preempt_count() <- 0
+#endif
+(pLvSys)br.cond.sptk.few .work_pending_syscall_end
+ br.cond.sptk.many .work_processed_kernel // re-check
+
+.notify:
+(pUStk) br.call.spnt.many rp=notify_resume_user
+.ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0
+(pLvSys)br.cond.sptk.few .work_pending_syscall_end
+ br.cond.sptk.many .work_processed_kernel // don't re-check
+
+// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
+// it could not be delivered. Deliver it now. The signal might be for us and
+// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
+// signal.
+
+.sigdelayed:
+ br.call.sptk.many rp=do_sigdelayed
+ cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check
+(pLvSys)br.cond.sptk.few .work_pending_syscall_end
+ br.cond.sptk.many .work_processed_kernel // re-check
+
+.work_pending_syscall_end:
+ adds r2=PT(R8)+16,r12
+ adds r3=PT(R10)+16,r12
+ ;;
+ ld8 r8=[r2]
+ ld8 r10=[r3]
+ br.cond.sptk.many .work_processed_syscall // re-check
+
+END(ia64_leave_kernel)
+
+ENTRY(handle_syscall_error)
+ /*
+ * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
+ * lead us to mistake a negative return value for a failed syscall. Those syscalls
+ * must deposit a non-zero value in pt_regs.r8 to indicate an error. If
+ * pt_regs.r8 is zero, we assume that the call completed successfully.
+ */
+ PT_REGS_UNWIND_INFO(0)
+ ld8 r3=[r2] // load pt_regs.r8
+ ;;
+ cmp.eq p6,p7=r3,r0 // is pt_regs.r8==0?
+ ;;
+(p7) mov r10=-1
+(p7) sub r8=0,r8 // negate return value to get errno
+ br.cond.sptk ia64_leave_syscall
+END(handle_syscall_error)
+
+ /*
+ * Invoke schedule_tail(task) while preserving in0-in7, which may be needed
+ * in case a system call gets restarted.
+ */
+GLOBAL_ENTRY(ia64_invoke_schedule_tail)
+ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
+ alloc loc1=ar.pfs,8,2,1,0
+ mov loc0=rp
+ mov out0=r8 // Address of previous task
+ ;;
+ br.call.sptk.many rp=schedule_tail
+.ret11: mov ar.pfs=loc1
+ mov rp=loc0
+ br.ret.sptk.many rp
+END(ia64_invoke_schedule_tail)
+
+ /*
+ * Set up the stack and call do_notify_resume_user(). Note that pSys and pNonSys need to
+ * be set up by the caller. We declare 8 input registers so the system call
+ * args get preserved, in case we need to restart a system call.
+ */
+ENTRY(notify_resume_user)
+ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8)
+ alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart!
+ mov r9=ar.unat
+ mov loc0=rp // save return address
+ mov out0=0 // there is no "oldset"
+ adds out1=8,sp // out1=&sigscratch->ar_pfs
+(pSys) mov out2=1 // out2==1 => we're in a syscall
+ ;;
+(pNonSys) mov out2=0 // out2==0 => not a syscall
+ .fframe 16
+ .spillpsp ar.unat, 16 // (note that offset is relative to psp+0x10!)
+ st8 [sp]=r9,-16 // allocate space for ar.unat and save it + st8 [out1]=loc1,-8 // save ar.pfs, out1=&sigscratch + .body + br.call.sptk.many rp=do_notify_resume_user +.ret15: .restore sp + adds sp=16,sp // pop scratch stack space + ;; + ld8 r9=[sp] // load new unat from sigscratch->scratch_unat + mov rp=loc0 + ;; + mov ar.unat=r9 + mov ar.pfs=loc1 + br.ret.sptk.many rp +END(notify_resume_user) + +GLOBAL_ENTRY(sys_rt_sigsuspend) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart! + mov r9=ar.unat + mov loc0=rp // save return address + mov out0=in0 // mask + mov out1=in1 // sigsetsize + adds out2=8,sp // out2=&sigscratch->ar_pfs + ;; + .fframe 16 + .spillpsp ar.unat, 16 // (note that offset is relative to psp+0x10!) + st8 [sp]=r9,-16 // allocate space for ar.unat and save it + st8 [out2]=loc1,-8 // save ar.pfs, out2=&sigscratch + .body + br.call.sptk.many rp=ia64_rt_sigsuspend +.ret17: .restore sp + adds sp=16,sp // pop scratch stack space + ;; + ld8 r9=[sp] // load new unat from sw->caller_unat + mov rp=loc0 + ;; + mov ar.unat=r9 + mov ar.pfs=loc1 + br.ret.sptk.many rp +END(sys_rt_sigsuspend) + +ENTRY(sys_rt_sigreturn) + PT_REGS_UNWIND_INFO(0) + /* + * Allocate 8 input registers since ptrace() may clobber them + */ + alloc r2=ar.pfs,8,0,1,0 + .prologue + PT_REGS_SAVES(16) + adds sp=-16,sp + .body + cmp.eq pNonSys,pSys=r0,r0 // sigreturn isn't a normal syscall... + ;; + /* + * leave_kernel() restores f6-f11 from pt_regs, but since the streamlined + * syscall-entry path does not save them we save them here instead. Note: we + * don't need to save any other registers that are not saved by the stream-lined + * syscall path, because restore_sigcontext() restores them. 
+ */ + adds r16=PT(F6)+32,sp + adds r17=PT(F7)+32,sp + ;; + stf.spill [r16]=f6,32 + stf.spill [r17]=f7,32 + ;; + stf.spill [r16]=f8,32 + stf.spill [r17]=f9,32 + ;; + stf.spill [r16]=f10 + stf.spill [r17]=f11 + adds out0=16,sp // out0 = &sigscratch + br.call.sptk.many rp=ia64_rt_sigreturn +.ret19: .restore sp 0 + adds sp=16,sp + ;; + ld8 r9=[sp] // load new ar.unat + mov.sptk b7=r8,ia64_leave_kernel + ;; + mov ar.unat=r9 + br.many b7 +END(sys_rt_sigreturn) + +GLOBAL_ENTRY(ia64_prepare_handle_unaligned) + .prologue + /* + * r16 = fake ar.pfs, we simply need to make sure privilege is still 0 + */ + mov r16=r0 + DO_SAVE_SWITCH_STACK + br.call.sptk.many rp=ia64_handle_unaligned // stack frame setup in ivt +.ret21: .body + DO_LOAD_SWITCH_STACK + br.cond.sptk.many rp // goes to ia64_leave_kernel +END(ia64_prepare_handle_unaligned) + + // + // unw_init_running(void (*callback)(info, arg), void *arg) + // +# define EXTRA_FRAME_SIZE ((UNW_FRAME_INFO_SIZE+15)&~15) + +GLOBAL_ENTRY(unw_init_running) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) + alloc loc1=ar.pfs,2,3,3,0 + ;; + ld8 loc2=[in0],8 + mov loc0=rp + mov r16=loc1 + DO_SAVE_SWITCH_STACK + .body + + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) + .fframe IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE + SWITCH_STACK_SAVES(EXTRA_FRAME_SIZE) + adds sp=-EXTRA_FRAME_SIZE,sp + .body + ;; + adds out0=16,sp // &info + mov out1=r13 // current + adds out2=16+EXTRA_FRAME_SIZE,sp // &switch_stack + br.call.sptk.many rp=unw_init_frame_info +1: adds out0=16,sp // &info + mov b6=loc2 + mov loc2=gp // save gp across indirect function call + ;; + ld8 gp=[in0] + mov out1=in1 // arg + br.call.sptk.many rp=b6 // invoke the callback function +1: mov gp=loc2 // restore gp + + // For now, we don't allow changing registers from within + // unw_init_running; if we ever want to allow that, we'd + // have to do a load_switch_stack here: + .restore sp + adds sp=IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE,sp + + mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(unw_init_running) + + .rodata + .align 8 + .globl sys_call_table +sys_call_table: + data8 sys_ni_syscall // This must be sys_ni_syscall! See ivt.S. 
+ data8 sys_exit // 1025 + data8 sys_read + data8 sys_write + data8 sys_open + data8 sys_close + data8 sys_creat // 1030 + data8 sys_link + data8 sys_unlink + data8 ia64_execve + data8 sys_chdir + data8 sys_fchdir // 1035 + data8 sys_utimes + data8 sys_mknod + data8 sys_chmod + data8 sys_chown + data8 sys_lseek // 1040 + data8 sys_getpid + data8 sys_getppid + data8 sys_mount + data8 sys_umount + data8 sys_setuid // 1045 + data8 sys_getuid + data8 sys_geteuid + data8 sys_ptrace + data8 sys_access + data8 sys_sync // 1050 + data8 sys_fsync + data8 sys_fdatasync + data8 sys_kill + data8 sys_rename + data8 sys_mkdir // 1055 + data8 sys_rmdir + data8 sys_dup + data8 sys_pipe + data8 sys_times + data8 ia64_brk // 1060 + data8 sys_setgid + data8 sys_getgid + data8 sys_getegid + data8 sys_acct + data8 sys_ioctl // 1065 + data8 sys_fcntl + data8 sys_umask + data8 sys_chroot + data8 sys_ustat + data8 sys_dup2 // 1070 + data8 sys_setreuid + data8 sys_setregid + data8 sys_getresuid + data8 sys_setresuid + data8 sys_getresgid // 1075 + data8 sys_setresgid + data8 sys_getgroups + data8 sys_setgroups + data8 sys_getpgid + data8 sys_setpgid // 1080 + data8 sys_setsid + data8 sys_getsid + data8 sys_sethostname + data8 sys_setrlimit + data8 sys_getrlimit // 1085 + data8 sys_getrusage + data8 sys_gettimeofday + data8 sys_settimeofday + data8 sys_select + data8 sys_poll // 1090 + data8 sys_symlink + data8 sys_readlink + data8 sys_uselib + data8 sys_swapon + data8 sys_swapoff // 1095 + data8 sys_reboot + data8 sys_truncate + data8 sys_ftruncate + data8 sys_fchmod + data8 sys_fchown // 1100 + data8 ia64_getpriority + data8 sys_setpriority + data8 sys_statfs + data8 sys_fstatfs + data8 sys_gettid // 1105 + data8 sys_semget + data8 sys_semop + data8 sys_semctl + data8 sys_msgget + data8 sys_msgsnd // 1110 + data8 sys_msgrcv + data8 sys_msgctl + data8 sys_shmget + data8 ia64_shmat + data8 sys_shmdt // 1115 + data8 sys_shmctl + data8 sys_syslog + data8 sys_setitimer + data8 sys_getitimer + data8 sys_ni_syscall // 1120 /* was: ia64_oldstat */ + data8 sys_ni_syscall /* was: ia64_oldlstat */ + data8 sys_ni_syscall /* was: ia64_oldfstat */ + data8 sys_vhangup + data8 sys_lchown + data8 sys_remap_file_pages // 1125 + data8 sys_wait4 + data8 sys_sysinfo + data8 sys_clone + data8 sys_setdomainname + data8 sys_newuname // 1130 + data8 sys_adjtimex + data8 sys_ni_syscall /* was: ia64_create_module */ + data8 sys_init_module + data8 sys_delete_module + data8 sys_ni_syscall // 1135 /* was: sys_get_kernel_syms */ + data8 sys_ni_syscall /* was: sys_query_module */ + data8 sys_quotactl + data8 sys_bdflush + data8 sys_sysfs + data8 sys_personality // 1140 + data8 sys_ni_syscall // sys_afs_syscall + data8 sys_setfsuid + data8 sys_setfsgid + data8 sys_getdents + data8 sys_flock // 1145 + data8 sys_readv + data8 sys_writev + data8 sys_pread64 + data8 sys_pwrite64 + data8 sys_sysctl // 1150 + data8 sys_mmap + data8 sys_munmap + data8 sys_mlock + data8 sys_mlockall + data8 sys_mprotect // 1155 + data8 ia64_mremap + data8 sys_msync + data8 sys_munlock + data8 sys_munlockall + data8 sys_sched_getparam // 1160 + data8 sys_sched_setparam + data8 sys_sched_getscheduler + data8 sys_sched_setscheduler + data8 sys_sched_yield + data8 sys_sched_get_priority_max // 1165 + data8 sys_sched_get_priority_min + data8 sys_sched_rr_get_interval + data8 sys_nanosleep + data8 sys_nfsservctl + data8 sys_prctl // 1170 + data8 sys_getpagesize + data8 sys_mmap2 + data8 sys_pciconfig_read + data8 sys_pciconfig_write + data8 sys_perfmonctl // 1175 + data8 
sys_sigaltstack + data8 sys_rt_sigaction + data8 sys_rt_sigpending + data8 sys_rt_sigprocmask + data8 sys_rt_sigqueueinfo // 1180 + data8 sys_rt_sigreturn + data8 sys_rt_sigsuspend + data8 sys_rt_sigtimedwait + data8 sys_getcwd + data8 sys_capget // 1185 + data8 sys_capset + data8 sys_sendfile64 + data8 sys_ni_syscall // sys_getpmsg (STREAMS) + data8 sys_ni_syscall // sys_putpmsg (STREAMS) + data8 sys_socket // 1190 + data8 sys_bind + data8 sys_connect + data8 sys_listen + data8 sys_accept + data8 sys_getsockname // 1195 + data8 sys_getpeername + data8 sys_socketpair + data8 sys_send + data8 sys_sendto + data8 sys_recv // 1200 + data8 sys_recvfrom + data8 sys_shutdown + data8 sys_setsockopt + data8 sys_getsockopt + data8 sys_sendmsg // 1205 + data8 sys_recvmsg + data8 sys_pivot_root + data8 sys_mincore + data8 sys_madvise + data8 sys_newstat // 1210 + data8 sys_newlstat + data8 sys_newfstat + data8 sys_clone2 + data8 sys_getdents64 + data8 sys_getunwind // 1215 + data8 sys_readahead + data8 sys_setxattr + data8 sys_lsetxattr + data8 sys_fsetxattr + data8 sys_getxattr // 1220 + data8 sys_lgetxattr + data8 sys_fgetxattr + data8 sys_listxattr + data8 sys_llistxattr + data8 sys_flistxattr // 1225 + data8 sys_removexattr + data8 sys_lremovexattr + data8 sys_fremovexattr + data8 sys_tkill + data8 sys_futex // 1230 + data8 sys_sched_setaffinity + data8 sys_sched_getaffinity + data8 sys_set_tid_address + data8 sys_fadvise64_64 + data8 sys_tgkill // 1235 + data8 sys_exit_group + data8 sys_lookup_dcookie + data8 sys_io_setup + data8 sys_io_destroy + data8 sys_io_getevents // 1240 + data8 sys_io_submit + data8 sys_io_cancel + data8 sys_epoll_create + data8 sys_epoll_ctl + data8 sys_epoll_wait // 1245 + data8 sys_restart_syscall + data8 sys_semtimedop + data8 sys_timer_create + data8 sys_timer_settime + data8 sys_timer_gettime // 1250 + data8 sys_timer_getoverrun + data8 sys_timer_delete + data8 sys_clock_settime + data8 sys_clock_gettime + data8 sys_clock_getres // 1255 + data8 sys_clock_nanosleep + data8 sys_fstatfs64 + data8 sys_statfs64 + data8 sys_mbind + data8 sys_get_mempolicy // 1260 + data8 sys_set_mempolicy + data8 sys_mq_open + data8 sys_mq_unlink + data8 sys_mq_timedsend + data8 sys_mq_timedreceive // 1265 + data8 sys_mq_notify + data8 sys_mq_getsetattr + data8 sys_ni_syscall // reserved for kexec_load + data8 sys_ni_syscall // reserved for vserver + data8 sys_waitid // 1270 + data8 sys_add_key + data8 sys_request_key + data8 sys_keyctl + data8 sys_ni_syscall + data8 sys_ni_syscall // 1275 + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + data8 sys_ni_syscall + + .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h new file mode 100644 index 000000000000..6d4ecec989b5 --- /dev/null +++ b/arch/ia64/kernel/entry.h @@ -0,0 +1,82 @@ +#include <linux/config.h> + +/* + * Preserved registers that are shared between code in ivt.S and + * entry.S. Be careful not to step on these! + */ +#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ +#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ +#define PRED_USER_STACK 3 /* returning to user-stacks? */ +#define PRED_SYSCALL 4 /* inside a system call? 
*/
+#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */
+
+#ifdef __ASSEMBLY__
+# define PASTE2(x,y) x##y
+# define PASTE(x,y) PASTE2(x,y)
+
+# define pLvSys PASTE(p,PRED_LEAVE_SYSCALL)
+# define pKStk PASTE(p,PRED_KERNEL_STACK)
+# define pUStk PASTE(p,PRED_USER_STACK)
+# define pSys PASTE(p,PRED_SYSCALL)
+# define pNonSys PASTE(p,PRED_NON_SYSCALL)
+#endif
+
+#define PT(f) (IA64_PT_REGS_##f##_OFFSET)
+#define SW(f) (IA64_SWITCH_STACK_##f##_OFFSET)
+
+#define PT_REGS_SAVES(off) \
+ .unwabi 3, 'i'; \
+ .fframe IA64_PT_REGS_SIZE+16+(off); \
+ .spillsp rp, PT(CR_IIP)+16+(off); \
+ .spillsp ar.pfs, PT(CR_IFS)+16+(off); \
+ .spillsp ar.unat, PT(AR_UNAT)+16+(off); \
+ .spillsp ar.fpsr, PT(AR_FPSR)+16+(off); \
+ .spillsp pr, PT(PR)+16+(off);
+
+#define PT_REGS_UNWIND_INFO(off) \
+ .prologue; \
+ PT_REGS_SAVES(off); \
+ .body
+
+#define SWITCH_STACK_SAVES(off) \
+ .savesp ar.unat,SW(CALLER_UNAT)+16+(off); \
+ .savesp ar.fpsr,SW(AR_FPSR)+16+(off); \
+ .spillsp f2,SW(F2)+16+(off); .spillsp f3,SW(F3)+16+(off); \
+ .spillsp f4,SW(F4)+16+(off); .spillsp f5,SW(F5)+16+(off); \
+ .spillsp f16,SW(F16)+16+(off); .spillsp f17,SW(F17)+16+(off); \
+ .spillsp f18,SW(F18)+16+(off); .spillsp f19,SW(F19)+16+(off); \
+ .spillsp f20,SW(F20)+16+(off); .spillsp f21,SW(F21)+16+(off); \
+ .spillsp f22,SW(F22)+16+(off); .spillsp f23,SW(F23)+16+(off); \
+ .spillsp f24,SW(F24)+16+(off); .spillsp f25,SW(F25)+16+(off); \
+ .spillsp f26,SW(F26)+16+(off); .spillsp f27,SW(F27)+16+(off); \
+ .spillsp f28,SW(F28)+16+(off); .spillsp f29,SW(F29)+16+(off); \
+ .spillsp f30,SW(F30)+16+(off); .spillsp f31,SW(F31)+16+(off); \
+ .spillsp r4,SW(R4)+16+(off); .spillsp r5,SW(R5)+16+(off); \
+ .spillsp r6,SW(R6)+16+(off); .spillsp r7,SW(R7)+16+(off); \
+ .spillsp b0,SW(B0)+16+(off); .spillsp b1,SW(B1)+16+(off); \
+ .spillsp b2,SW(B2)+16+(off); .spillsp b3,SW(B3)+16+(off); \
+ .spillsp b4,SW(B4)+16+(off); .spillsp b5,SW(B5)+16+(off); \
+ .spillsp ar.pfs,SW(AR_PFS)+16+(off); .spillsp ar.lc,SW(AR_LC)+16+(off); \
+ .spillsp @priunat,SW(AR_UNAT)+16+(off); \
+ .spillsp ar.rnat,SW(AR_RNAT)+16+(off); \
+ .spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off); \
+ .spillsp pr,SW(PR)+16+(off)
+
+#define DO_SAVE_SWITCH_STACK \
+ movl r28=1f; \
+ ;; \
+ .fframe IA64_SWITCH_STACK_SIZE; \
+ adds sp=-IA64_SWITCH_STACK_SIZE,sp; \
+ mov.ret.sptk b7=r28,1f; \
+ SWITCH_STACK_SAVES(0); \
+ br.cond.sptk.many save_switch_stack; \
+1:
+
+#define DO_LOAD_SWITCH_STACK \
+ movl r28=1f; \
+ ;; \
+ invala; \
+ mov.ret.sptk b7=r28,1f; \
+ br.cond.sptk.many load_switch_stack; \
+1: .restore sp; \
+ adds sp=IA64_SWITCH_STACK_SIZE,sp
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
new file mode 100644
index 000000000000..0d8650f7fce7
--- /dev/null
+++ b/arch/ia64/kernel/fsys.S
@@ -0,0 +1,884 @@
+/*
+ * This file contains the light-weight system call handlers (fsyscall-handlers).
+ *
+ * Copyright (C) 2003 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 25-Sep-03 davidm Implement fsys_rt_sigprocmask().
+ * 18-Feb-03 louisk Implement fsys_gettimeofday().
+ * 28-Feb-03 davidm Fixed several bugs in fsys_gettimeofday(). Tuned it some more,
+ * probably broke it along the way... ;-)
+ * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make
+ * it capable of using memory based clocks without falling back to C code.
+ */ + +#include <asm/asmmacro.h> +#include <asm/errno.h> +#include <asm/offsets.h> +#include <asm/percpu.h> +#include <asm/thread_info.h> +#include <asm/sal.h> +#include <asm/signal.h> +#include <asm/system.h> +#include <asm/unistd.h> + +#include "entry.h" + +/* + * See Documentation/ia64/fsys.txt for details on fsyscalls. + * + * On entry to an fsyscall handler: + * r10 = 0 (i.e., defaults to "successful syscall return") + * r11 = saved ar.pfs (a user-level value) + * r15 = system call number + * r16 = "current" task pointer (in normal kernel-mode, this is in r13) + * r32-r39 = system call arguments + * b6 = return address (a user-level value) + * ar.pfs = previous frame-state (a user-level value) + * PSR.be = cleared to zero (i.e., little-endian byte order is in effect) + * all other registers may contain values passed in from user-mode + * + * On return from an fsyscall handler: + * r11 = saved ar.pfs (as passed into the fsyscall handler) + * r15 = system call number (as passed into the fsyscall handler) + * r32-r39 = system call arguments (as passed into the fsyscall handler) + * b6 = return address (as passed into the fsyscall handler) + * ar.pfs = previous frame-state (as passed into the fsyscall handler) + */ + +ENTRY(fsys_ni_syscall) + .prologue + .altrp b6 + .body + mov r8=ENOSYS + mov r10=-1 + FSYS_RETURN +END(fsys_ni_syscall) + +ENTRY(fsys_getpid) + .prologue + .altrp b6 + .body + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + ld4 r9=[r9] + add r8=IA64_TASK_TGID_OFFSET,r16 + ;; + and r9=TIF_ALLWORK_MASK,r9 + ld4 r8=[r8] // r8 = current->tgid + ;; + cmp.ne p8,p0=0,r9 +(p8) br.spnt.many fsys_fallback_syscall + FSYS_RETURN +END(fsys_getpid) + +ENTRY(fsys_getppid) + .prologue + .altrp b6 + .body + add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 + ;; + ld8 r17=[r17] // r17 = current->group_leader + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + + ld4 r9=[r9] + add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = ¤t->group_leader->real_parent + ;; + and r9=TIF_ALLWORK_MASK,r9 + +1: ld8 r18=[r17] // r18 = current->group_leader->real_parent + ;; + cmp.ne p8,p0=0,r9 + add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = ¤t->group_leader->real_parent->tgid + ;; + + /* + * The .acq is needed to ensure that the read of tgid has returned its data before + * we re-check "real_parent". + */ + ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid +#ifdef CONFIG_SMP + /* + * Re-read current->group_leader->real_parent. + */ + ld8 r19=[r17] // r19 = current->group_leader->real_parent +(p8) br.spnt.many fsys_fallback_syscall + ;; + cmp.ne p6,p0=r18,r19 // did real_parent change? + mov r19=0 // i must not leak kernel bits... +(p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check + ;; + mov r17=0 // i must not leak kernel bits... + mov r18=0 // i must not leak kernel bits... +#else + mov r17=0 // i must not leak kernel bits... + mov r18=0 // i must not leak kernel bits... + mov r19=0 // i must not leak kernel bits... +#endif + FSYS_RETURN +END(fsys_getppid) + +ENTRY(fsys_set_tid_address) + .prologue + .altrp b6 + .body + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + ld4 r9=[r9] + tnat.z p6,p7=r32 // check argument register for being NaT + ;; + and r9=TIF_ALLWORK_MASK,r9 + add r8=IA64_TASK_PID_OFFSET,r16 + add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16 + ;; + ld4 r8=[r8] + cmp.ne p8,p0=0,r9 + mov r17=-1 + ;; +(p6) st8 [r18]=r32 +(p7) st8 [r18]=r17 +(p8) br.spnt.many fsys_fallback_syscall + ;; + mov r17=0 // i must not leak kernel bits... + mov r18=0 // i must not leak kernel bits... 
+ FSYS_RETURN
+END(fsys_set_tid_address)
+
+/*
+ * Ensure that the time interpolator structure is compatible with the asm code
+ */
+#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \
+ || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4
+#error fsys_gettimeofday incompatible with changes to struct time_interpolator
+#endif
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_DIVIDE_BY_1000 0x4000
+#define CLOCK_ADD_MONOTONIC 0x8000
+
+ENTRY(fsys_gettimeofday)
+ .prologue
+ .altrp b6
+ .body
+ mov r31 = r32
+ tnat.nz p6,p0 = r33 // guard against NaT argument
+(p6) br.cond.spnt.few .fail_einval
+ mov r30 = CLOCK_DIVIDE_BY_1000
+ ;;
+.gettime:
+ // Register map
+ // Incoming r31 = pointer to address where to place result
+ // r30 = flags determining how time is processed
+ // r2,r3 = temp r4-r7 preserved
+ // r8 = result nanoseconds
+ // r9 = result seconds
+ // r10 = temporary storage for clock difference
+ // r11 = preserved: saved ar.pfs
+ // r12 = preserved: memory stack
+ // r13 = preserved: thread pointer
+ // r14 = address of mask / mask
+ // r15 = preserved: system call number
+ // r16 = preserved: current task pointer
+ // r17 = wall to monotonic use
+ // r18 = time_interpolator->offset
+ // r19 = address of wall_to_monotonic
+ // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address
+ // r21 = shift factor
+ // r22 = address of time_interpolator->last_counter
+ // r23 = address of time_interpolator->last_cycle
+ // r24 = address of time_interpolator->offset
+ // r25 = last_cycle value
+ // r26 = last_counter value
+ // r27 = pointer to xtime
+ // r28 = sequence number at the beginning of critical section
+ // r29 = address of seqlock
+ // r30 = time processing flags / memory address
+ // r31 = pointer to result
+ // Predicates
+ // p6,p7 short term use
+ // p8 = timesource ar.itc
+ // p9 = timesource mmio64
+ // p10 = timesource mmio32
+ // p11 = timesource not to be handled by asm code
+ // p12 = memory time source ( = p9 | p10)
+ // p13 = do cmpxchg with time_interpolator_last_cycle
+ // p14 = Divide by 1000
+ // p15 = Add monotonic
+ //
+ // Note that instructions are optimized for McKinley. McKinley can process two
+ // bundles simultaneously and therefore we continuously try to feed the CPU
+ // two bundles and then a stop.
+ tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure
+ mov pr = r30,0xc000 // Set predicates according to function
+ add r2 = TI_FLAGS+IA64_TASK_SIZE,r16
+ movl r20 = time_interpolator
+ ;;
+ ld8 r20 = [r20] // get pointer to time_interpolator structure
+ movl r29 = xtime_lock
+ ld4 r2 = [r2] // process work pending flags
+ movl r27 = xtime
+ ;; // only one bundle here
+ ld8 r21 = [r20] // first quad with control information
+ and r2 = TIF_ALLWORK_MASK,r2
+(p6) br.cond.spnt.few .fail_einval // deferred branch
+ ;;
+ add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20
+ extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc
+ extr r8 = r21,0,16 // time_interpolator->source
+ cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled
+(p6) br.cond.spnt.many fsys_fallback_syscall
+ ;;
+ cmp.eq p8,p12 = 0,r8 // Check for cpu timer
+ cmp.eq p9,p0 = 1,r8 // MMIO64 ?
+ extr r2 = r21,24,8 // time_interpolator->jitter
+ cmp.eq p10,p0 = 2,r8 // MMIO32 ?
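+ /*
+ * (Sketch) The .time_redo loop below is a seqlock read-side critical
+ * section; in C-like terms (jitter compensation and the last_cycle
+ * cmpxchg are omitted for brevity):
+ *
+ * do {
+ * seq = xtime_lock.sequence; // ld4.acq
+ * now = read_clock(); // ar.itc, MMIO64 or MMIO32
+ * delta = (now - ti->last_counter) & ti->mask;
+ * nsec = xtime.tv_nsec + ti->offset
+ * + ((delta * ti->nsec_per_cyc) >> ti->shift);
+ * sec = xtime.tv_sec;
+ * } while ((seq & 1) || seq != xtime_lock.sequence);
+ * while (nsec >= 1000000000) { nsec -= 1000000000; sec++; }
+ */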
+ cmp.ltu p11,p0 = 2,r8 // function or other clock
+(p11) br.cond.spnt.many fsys_fallback_syscall
+ ;;
+ setf.sig f7 = r3 // Setup for scaling of counter
+(p15) movl r19 = wall_to_monotonic
+(p12) ld8 r30 = [r10]
+ cmp.ne p13,p0 = r2,r0 // need jitter compensation?
+ extr r21 = r21,16,8 // shift factor
+ ;;
+.time_redo:
+ .pred.rel.mutex p8,p9,p10
+ ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes
+(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!
+ add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20
+(p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues..
+(p10) ld4 r2 = [r30] // readl(ti->address)
+(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20
+ ;; // could be removed by moving the last add upward
+ ld8 r26 = [r22] // time_interpolator->last_counter
+(p13) ld8 r25 = [r23] // time_interpolator->last_cycle
+ add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20
+(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET
+ ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET
+ add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20
+ ;;
+ ld8 r18 = [r24] // time_interpolator->offset
+ ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec
+(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm)
+ ;;
+ ld8 r14 = [r14] // time_interpolator->mask
+(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared
+ sub r10 = r2,r26 // current_counter - last_counter
+ ;;
+(p6) sub r10 = r25,r26 // time we got was less than last_cycle
+(p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg
+ ;;
+ and r10 = r10,r14 // Apply mask
+ ;;
+ setf.sig f8 = r10
+ nop.i 123
+ ;;
+(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv
+EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time
+ xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter)
+(p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs
+ ;;
+(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET
+(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo
+ // simulate tbit.nz.or p7,p0 = r28,0
+ and r28 = ~1,r28 // Make sequence even to force retry if odd
+ getf.sig r2 = f8
+ mf
+ add r8 = r8,r18 // Add time interpolator offset
+ ;;
+ ld4 r10 = [r29] // xtime_lock.sequence
+(p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs
+ shr.u r2 = r2,r21
+ ;; // overloaded 3 bundles!
+ // End critical section.
+ add r8 = r8,r2 // Add xtime.nsecs
+ cmp4.ne.or p7,p0 = r28,r10
+(p7) br.cond.dpnt.few .time_redo // sequence number changed ?
+ // Now r8=tv->tv_nsec and r9=tv->tv_sec
+ mov r10 = r0
+ movl r2 = 1000000000
+ add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31
+(p14) movl r3 = 2361183241434822607 // Prep for / 1000 hack
+ ;;
+.time_normalize:
+ mov r21 = r8
+ cmp.ge p6,p0 = r8,r2
+(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting some time
+ ;;
+(p14) setf.sig f8 = r20
+(p6) sub r8 = r8,r2
+(p6) add r9 = 1,r9 // two nops before the branch.
+(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod
+(p6) br.cond.dpnt.few .time_normalize
+ ;;
+ // Divided by 8 through a shift. Now divide by 125
+ // The compiler was able to do that with a multiply
+ // and a shift and we do the same
+EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles
+(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it...
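+ /*
+ * (Sketch) The "/ 1000 hack": M = 2361183241434822607 = ceil(2^68/125),
+ * so for the nanosecond values possible here
+ *
+ * nsec / 1000 == (u64)(((u128)(nsec >> 3) * M) >> 64) >> 4;
+ *
+ * i.e. divide by 8 with a shift (done in .time_normalize above), then
+ * by 125 with a multiply-high -- which is what the setf.sig/xmpy.hu/
+ * getf.sig/shr.u sequence around this point computes.
+ */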
+ ;; + mov r8 = r0 +(p14) getf.sig r2 = f8 + ;; +(p14) shr.u r21 = r2, 4 + ;; +EX(.fail_efault, st8 [r31] = r9) +EX(.fail_efault, st8 [r23] = r21) + FSYS_RETURN +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN +END(fsys_gettimeofday) + +ENTRY(fsys_clock_gettime) + .prologue + .altrp b6 + .body + cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32 + // Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC +(p6) br.spnt.few fsys_fallback_syscall + mov r31 = r33 + shl r30 = r32,15 + br.many .gettime +END(fsys_clock_gettime) + +/* + * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). + */ +#if _NSIG_WORDS != 1 +# error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1. +#endif +ENTRY(fsys_rt_sigprocmask) + .prologue + .altrp b6 + .body + + add r2=IA64_TASK_BLOCKED_OFFSET,r16 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + cmp4.ltu p6,p0=SIG_SETMASK,r32 + + cmp.ne p15,p0=r0,r34 // oset != NULL? + tnat.nz p8,p0=r34 + add r31=IA64_TASK_SIGHAND_OFFSET,r16 + ;; + ld8 r3=[r2] // read/prefetch current->blocked + ld4 r9=[r9] + tnat.nz.or p6,p0=r35 + + cmp.ne.or p6,p0=_NSIG_WORDS*8,r35 + tnat.nz.or p6,p0=r32 +(p6) br.spnt.few .fail_einval // fail with EINVAL + ;; +#ifdef CONFIG_SMP + ld8 r31=[r31] // r31 <- current->sighand +#endif + and r9=TIF_ALLWORK_MASK,r9 + tnat.nz.or p8,p0=r33 + ;; + cmp.ne p7,p0=0,r9 + cmp.eq p6,p0=r0,r33 // set == NULL? + add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock +(p8) br.spnt.few .fail_efault // fail with EFAULT +(p7) br.spnt.many fsys_fallback_syscall // got pending kernel work... +(p6) br.dpnt.many .store_mask // -> short-circuit to just reading the signal mask + + /* Argh, we actually have to do some work and _update_ the signal mask: */ + +EX(.fail_efault, probe.r.fault r33, 3) // verify user has read-access to *set +EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set + mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) + ;; + + rsm psr.i // mask interrupt delivery + mov ar.ccv=0 + andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP + +#ifdef CONFIG_SMP + mov r17=1 + ;; + cmpxchg4.acq r18=[r31],r17,ar.ccv // try to acquire the lock + mov r8=EINVAL // default to EINVAL + ;; + ld8 r3=[r2] // re-read current->blocked now that we hold the lock + cmp4.ne p6,p0=r18,r0 +(p6) br.cond.spnt.many .lock_contention + ;; +#else + ld8 r3=[r2] // re-read current->blocked now that we hold the lock + mov r8=EINVAL // default to EINVAL +#endif + add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16 + add r19=IA64_TASK_SIGNAL_OFFSET,r16 + cmp4.eq p6,p0=SIG_BLOCK,r32 + ;; + ld8 r19=[r19] // r19 <- current->signal + cmp4.eq p7,p0=SIG_UNBLOCK,r32 + cmp4.eq p8,p0=SIG_SETMASK,r32 + ;; + ld8 r18=[r18] // r18 <- current->pending.signal + .pred.rel.mutex p6,p7,p8 +(p6) or r14=r3,r14 // SIG_BLOCK +(p7) andcm r14=r3,r14 // SIG_UNBLOCK + +(p8) mov r14=r14 // SIG_SETMASK +(p6) mov r8=0 // clear error code + // recalc_sigpending() + add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19 + + add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19 + ;; + ld4 r17=[r17] // r17 <- current->signal->group_stop_count +(p7) mov r8=0 // clear error code + + ld8 r19=[r19] // r19 <- current->signal->shared_pending + ;; + cmp4.gt p6,p7=r17,r0 // p6/p7 <- (current->signal->group_stop_count > 0)? 
+(p8) mov r8=0 // clear error code
+
+ or r18=r18,r19 // r18 <- current->pending | current->signal->shared_pending
+ ;;
+ // r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked:
+ andcm r18=r18,r14
+ add r9=TI_FLAGS+IA64_TASK_SIZE,r16
+ ;;
+
+(p7) cmp.ne.or.andcm p6,p7=r18,r0 // p6/p7 <- signal pending
+ mov r19=0 // i must not leak kernel bits...
+(p6) br.cond.dpnt.many .sig_pending
+ ;;
+
+1: ld4 r17=[r9] // r17 <- current->thread_info->flags
+ ;;
+ mov ar.ccv=r17
+ and r18=~_TIF_SIGPENDING,r17 // r18 <- r17 & ~(1 << TIF_SIGPENDING)
+ ;;
+
+ st8 [r2]=r14 // update current->blocked with new mask
+ cmpxchg4.acq r14=[r9],r18,ar.ccv // current->thread_info->flags <- r18
+ ;;
+ cmp.ne p6,p0=r17,r14 // update failed?
+(p6) br.cond.spnt.few 1b // yes -> retry
+
+#ifdef CONFIG_SMP
+ st4.rel [r31]=r0 // release the lock
+#endif
+ ssm psr.i
+ ;;
+
+ srlz.d // ensure psr.i is set again
+ mov r18=0 // i must not leak kernel bits...
+
+.store_mask:
+EX(.fail_efault, (p15) probe.w.fault r34, 3) // verify user has write-access to *oset
+EX(.fail_efault, (p15) st8 [r34]=r3)
+ mov r2=0 // i must not leak kernel bits...
+ mov r3=0 // i must not leak kernel bits...
+ mov r8=0 // return 0
+ mov r9=0 // i must not leak kernel bits...
+ mov r14=0 // i must not leak kernel bits...
+ mov r17=0 // i must not leak kernel bits...
+ mov r31=0 // i must not leak kernel bits...
+ FSYS_RETURN
+
+.sig_pending:
+#ifdef CONFIG_SMP
+ st4.rel [r31]=r0 // release the lock
+#endif
+ ssm psr.i
+ ;;
+ srlz.d
+ br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall
+
+#ifdef CONFIG_SMP
+.lock_contention:
+ /* Rather than spinning here, fall back on doing a heavy-weight syscall. */
+ ssm psr.i
+ ;;
+ srlz.d
+ br.sptk.many fsys_fallback_syscall
+#endif
+END(fsys_rt_sigprocmask)
+
+ENTRY(fsys_fallback_syscall)
+ .prologue
+ .altrp b6
+ .body
+ /*
+ * We only get here from light-weight syscall handlers. Thus, we already
+ * know that r15 contains a valid syscall number. No need to re-check.
+ */
+ adds r17=-1024,r15
+ movl r14=sys_call_table
+ ;;
+ rsm psr.i
+ shladd r18=r17,3,r14
+ ;;
+ ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point
+ mov r29=psr // read psr (12 cyc load latency)
+ mov r27=ar.rsc
+ mov r21=ar.fpsr
+ mov r26=ar.pfs
+END(fsys_fallback_syscall)
+ /* FALL THROUGH */
+GLOBAL_ENTRY(fsys_bubble_down)
+ .prologue
+ .altrp b6
+ .body
+ /*
+ * We get here for syscalls that don't have a lightweight handler. For those, we
+ * need to bubble down into the kernel and that requires setting up a minimal
+ * pt_regs structure, and initializing the CPU state more or less as if an
+ * interruption had occurred. To make syscall-restarts work, we set up pt_regs
+ * such that cr_iip points to the second instruction in syscall_via_break.
+ * Hence, decrementing the IP will restart the syscall via break, while leaving
+ * the IP unchanged will return us to the caller, as usual. Note that we preserve
+ * the value of psr.pp rather than initializing it from dcr.pp. This makes it
+ * possible to distinguish fsyscall execution from other privileged execution.
+ * + * On entry: + * - normal fsyscall handler register usage, except that we also have: + * - r18: address of syscall entry point + * - r21: ar.fpsr + * - r26: ar.pfs + * - r27: ar.rsc + * - r29: psr + */ +# define PSR_PRESERVED_BITS (IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \ + | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \ + | IA64_PSR_IC) + /* + * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. The rest we have + * to synthesize. + */ +# define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \ + | IA64_PSR_BN | IA64_PSR_I) + + invala + movl r8=PSR_ONE_BITS + + mov r25=ar.unat // save ar.unat (5 cyc) + movl r9=PSR_PRESERVED_BITS + + mov ar.rsc=0 // set enforced lazy mode, pl 0, little-endian, loadrs=0 + movl r28=__kernel_syscall_via_break + ;; + mov r23=ar.bspstore // save ar.bspstore (12 cyc) + mov r31=pr // save pr (2 cyc) + mov r20=r1 // save caller's gp in r20 + ;; + mov r2=r16 // copy current task addr to addl-addressable register + and r9=r9,r29 + mov r19=b6 // save b6 (2 cyc) + ;; + mov psr.l=r9 // slam the door (17 cyc to srlz.i) + or r29=r8,r29 // construct cr.ipsr value to save + addl r22=IA64_RBS_OFFSET,r2 // compute base of RBS + ;; + // GAS reports a spurious RAW hazard on the read of ar.rnat because it thinks + // we may be reading ar.itc after writing to psr.l. Avoid that message with + // this directive: + dv_serialize_data + mov.m r24=ar.rnat // read ar.rnat (5 cyc lat) + lfetch.fault.excl.nt1 [r22] + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2 + + // ensure previous insn group is issued before we stall for srlz.i: + ;; + srlz.i // ensure new psr.l has been established + ///////////////////////////////////////////////////////////////////////////// + ////////// from this point on, execution is not interruptible anymore + ///////////////////////////////////////////////////////////////////////////// + addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // compute base of memory stack + cmp.ne pKStk,pUStk=r0,r0 // set pKStk <- 0, pUStk <- 1 + ;; + st1 [r16]=r0 // clear current->thread.on_ustack flag + mov ar.bspstore=r22 // switch to kernel RBS + mov b6=r18 // copy syscall entry-point to b6 (7 cyc) + add r3=TI_FLAGS+IA64_TASK_SIZE,r2 + ;; + ld4 r3=[r3] // r2 = current_thread_info()->flags + mov r18=ar.bsp // save (kernel) ar.bsp (12 cyc) + mov ar.rsc=0x3 // set eager mode, pl 0, little-endian, loadrs=0 + br.call.sptk.many b7=ia64_syscall_setup + ;; + ssm psr.i + movl r2=ia64_ret_from_syscall + ;; + mov rp=r2 // set the real return addr + tbit.z p8,p0=r3,TIF_SYSCALL_TRACE + ;; +(p10) br.cond.spnt.many ia64_ret_from_syscall // p10==true means out registers are more than 8 +(p8) br.call.sptk.many b6=b6 // ignore this return addr + br.cond.sptk ia64_trace_syscall +END(fsys_bubble_down) + + .rodata + .align 8 + .globl fsyscall_table + + data8 fsys_bubble_down +fsyscall_table: + data8 fsys_ni_syscall + data8 0 // exit // 1025 + data8 0 // read + data8 0 // write + data8 0 // open + data8 0 // close + data8 0 // creat // 1030 + data8 0 // link + data8 0 // unlink + data8 0 // execve + data8 0 // chdir + data8 0 // fchdir // 1035 + data8 0 // utimes + data8 0 // mknod + data8 0 // chmod + data8 0 // chown + data8 0 // lseek // 1040 + data8 fsys_getpid // getpid + data8 fsys_getppid // getppid + data8 0 // mount + data8 0 // umount + data8 0 // setuid // 1045 + data8 0 // getuid + data8 0 // geteuid + data8 0 // ptrace + data8 0 // access + data8 0 // sync // 1050 + data8 0 // fsync + data8 0 // fdatasync + data8 0 // kill + data8 0 
// rename + data8 0 // mkdir // 1055 + data8 0 // rmdir + data8 0 // dup + data8 0 // pipe + data8 0 // times + data8 0 // brk // 1060 + data8 0 // setgid + data8 0 // getgid + data8 0 // getegid + data8 0 // acct + data8 0 // ioctl // 1065 + data8 0 // fcntl + data8 0 // umask + data8 0 // chroot + data8 0 // ustat + data8 0 // dup2 // 1070 + data8 0 // setreuid + data8 0 // setregid + data8 0 // getresuid + data8 0 // setresuid + data8 0 // getresgid // 1075 + data8 0 // setresgid + data8 0 // getgroups + data8 0 // setgroups + data8 0 // getpgid + data8 0 // setpgid // 1080 + data8 0 // setsid + data8 0 // getsid + data8 0 // sethostname + data8 0 // setrlimit + data8 0 // getrlimit // 1085 + data8 0 // getrusage + data8 fsys_gettimeofday // gettimeofday + data8 0 // settimeofday + data8 0 // select + data8 0 // poll // 1090 + data8 0 // symlink + data8 0 // readlink + data8 0 // uselib + data8 0 // swapon + data8 0 // swapoff // 1095 + data8 0 // reboot + data8 0 // truncate + data8 0 // ftruncate + data8 0 // fchmod + data8 0 // fchown // 1100 + data8 0 // getpriority + data8 0 // setpriority + data8 0 // statfs + data8 0 // fstatfs + data8 0 // gettid // 1105 + data8 0 // semget + data8 0 // semop + data8 0 // semctl + data8 0 // msgget + data8 0 // msgsnd // 1110 + data8 0 // msgrcv + data8 0 // msgctl + data8 0 // shmget + data8 0 // shmat + data8 0 // shmdt // 1115 + data8 0 // shmctl + data8 0 // syslog + data8 0 // setitimer + data8 0 // getitimer + data8 0 // 1120 + data8 0 + data8 0 + data8 0 // vhangup + data8 0 // lchown + data8 0 // remap_file_pages // 1125 + data8 0 // wait4 + data8 0 // sysinfo + data8 0 // clone + data8 0 // setdomainname + data8 0 // newuname // 1130 + data8 0 // adjtimex + data8 0 + data8 0 // init_module + data8 0 // delete_module + data8 0 // 1135 + data8 0 + data8 0 // quotactl + data8 0 // bdflush + data8 0 // sysfs + data8 0 // personality // 1140 + data8 0 // afs_syscall + data8 0 // setfsuid + data8 0 // setfsgid + data8 0 // getdents + data8 0 // flock // 1145 + data8 0 // readv + data8 0 // writev + data8 0 // pread64 + data8 0 // pwrite64 + data8 0 // sysctl // 1150 + data8 0 // mmap + data8 0 // munmap + data8 0 // mlock + data8 0 // mlockall + data8 0 // mprotect // 1155 + data8 0 // mremap + data8 0 // msync + data8 0 // munlock + data8 0 // munlockall + data8 0 // sched_getparam // 1160 + data8 0 // sched_setparam + data8 0 // sched_getscheduler + data8 0 // sched_setscheduler + data8 0 // sched_yield + data8 0 // sched_get_priority_max // 1165 + data8 0 // sched_get_priority_min + data8 0 // sched_rr_get_interval + data8 0 // nanosleep + data8 0 // nfsservctl + data8 0 // prctl // 1170 + data8 0 // getpagesize + data8 0 // mmap2 + data8 0 // pciconfig_read + data8 0 // pciconfig_write + data8 0 // perfmonctl // 1175 + data8 0 // sigaltstack + data8 0 // rt_sigaction + data8 0 // rt_sigpending + data8 fsys_rt_sigprocmask // rt_sigprocmask + data8 0 // rt_sigqueueinfo // 1180 + data8 0 // rt_sigreturn + data8 0 // rt_sigsuspend + data8 0 // rt_sigtimedwait + data8 0 // getcwd + data8 0 // capget // 1185 + data8 0 // capset + data8 0 // sendfile + data8 0 + data8 0 + data8 0 // socket // 1190 + data8 0 // bind + data8 0 // connect + data8 0 // listen + data8 0 // accept + data8 0 // getsockname // 1195 + data8 0 // getpeername + data8 0 // socketpair + data8 0 // send + data8 0 // sendto + data8 0 // recv // 1200 + data8 0 // recvfrom + data8 0 // shutdown + data8 0 // setsockopt + data8 0 // getsockopt + data8 0 // sendmsg // 1205 + data8 
0 // recvmsg + data8 0 // pivot_root + data8 0 // mincore + data8 0 // madvise + data8 0 // newstat // 1210 + data8 0 // newlstat + data8 0 // newfstat + data8 0 // clone2 + data8 0 // getdents64 + data8 0 // getunwind // 1215 + data8 0 // readahead + data8 0 // setxattr + data8 0 // lsetxattr + data8 0 // fsetxattr + data8 0 // getxattr // 1220 + data8 0 // lgetxattr + data8 0 // fgetxattr + data8 0 // listxattr + data8 0 // llistxattr + data8 0 // flistxattr // 1225 + data8 0 // removexattr + data8 0 // lremovexattr + data8 0 // fremovexattr + data8 0 // tkill + data8 0 // futex // 1230 + data8 0 // sched_setaffinity + data8 0 // sched_getaffinity + data8 fsys_set_tid_address // set_tid_address + data8 0 // fadvise64_64 + data8 0 // tgkill // 1235 + data8 0 // exit_group + data8 0 // lookup_dcookie + data8 0 // io_setup + data8 0 // io_destroy + data8 0 // io_getevents // 1240 + data8 0 // io_submit + data8 0 // io_cancel + data8 0 // epoll_create + data8 0 // epoll_ctl + data8 0 // epoll_wait // 1245 + data8 0 // restart_syscall + data8 0 // semtimedop + data8 0 // timer_create + data8 0 // timer_settime + data8 0 // timer_gettime // 1250 + data8 0 // timer_getoverrun + data8 0 // timer_delete + data8 0 // clock_settime + data8 fsys_clock_gettime // clock_gettime + data8 0 // clock_getres // 1255 + data8 0 // clock_nanosleep + data8 0 // fstatfs64 + data8 0 // statfs64 + data8 0 + data8 0 // 1260 + data8 0 + data8 0 // mq_open + data8 0 // mq_unlink + data8 0 // mq_timedsend + data8 0 // mq_timedreceive // 1265 + data8 0 // mq_notify + data8 0 // mq_getsetattr + data8 0 // kexec_load + data8 0 + data8 0 // 1270 + data8 0 + data8 0 + data8 0 + data8 0 + data8 0 // 1275 + data8 0 + data8 0 + data8 0 + data8 0 + + .org fsyscall_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S new file mode 100644 index 000000000000..258c0a3238fb --- /dev/null +++ b/arch/ia64/kernel/gate-data.S @@ -0,0 +1,3 @@ + .section .data.gate, "aw" + + .incbin "arch/ia64/kernel/gate.so" diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S new file mode 100644 index 000000000000..facf75acdc85 --- /dev/null +++ b/arch/ia64/kernel/gate.S @@ -0,0 +1,372 @@ +/* + * This file contains the code that gets mapped at the upper end of each task's text + * region. For now, it contains the signal trampoline code only. + * + * Copyright (C) 1999-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#include <linux/config.h> + +#include <asm/asmmacro.h> +#include <asm/errno.h> +#include <asm/offsets.h> +#include <asm/sigcontext.h> +#include <asm/system.h> +#include <asm/unistd.h> + +/* + * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation, + * complications with the linker (which likes to create PLT stubs for branches + * to targets outside the shared object) and to avoid multi-phase kernel builds, we + * simply create minimalistic "patch lists" in special ELF sections. + */ + .section ".data.patch.fsyscall_table", "a" + .previous +#define LOAD_FSYSCALL_TABLE(reg) \ +[1:] movl reg=0; \ + .xdata4 ".data.patch.fsyscall_table", 1b-. + + .section ".data.patch.brl_fsys_bubble_down", "a" + .previous +#define BRL_COND_FSYS_BUBBLE_DOWN(pr) \ +[1:](pr)brl.cond.sptk 0; \ + .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-. 
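+/*
+ * (Sketch) Each .xdata4 entry above records the self-relative location of
+ * an instruction whose immediate must be patched at boot, once the final
+ * target address is known; gate.lds.S below collects the entries between
+ * __start_gate_... and __end_gate_... symbols. A consumer looks roughly
+ * like this (illustrative helper name):
+ *
+ * void patch_list(s32 *offp, s32 *end, u64 target)
+ * {
+ * while (offp < end) {
+ * u64 insn_addr = (u64)offp + *offp; // self-relative
+ * patch_imm64(insn_addr, target); // rewrite the bundle
+ * offp++;
+ * }
+ * }
+ */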
+ +GLOBAL_ENTRY(__kernel_syscall_via_break) + .prologue + .altrp b6 + .body + /* + * Note: for (fast) syscall restart to work, the break instruction must be + * the first one in the bundle addressed by syscall_via_break. + */ +{ .mib + break 0x100000 + nop.i 0 + br.ret.sptk.many b6 +} +END(__kernel_syscall_via_break) + +/* + * On entry: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * b6 = return address + * On exit: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * all other "scratch" registers: undefined + * all "preserved" registers: same as on entry + */ + +GLOBAL_ENTRY(__kernel_syscall_via_epc) + .prologue + .altrp b6 + .body +{ + /* + * Note: the kernel cannot assume that the first two instructions in this + * bundle get executed. The remaining code must be safe even if + * they do not get executed. + */ + adds r17=-1024,r15 + mov r10=0 // default to successful syscall execution + epc +} + ;; + rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be" + LOAD_FSYSCALL_TABLE(r14) + + mov r16=IA64_KR(CURRENT) // 12 cycle read latency + tnat.nz p10,p9=r15 + mov r19=NR_syscalls-1 + ;; + shladd r18=r17,3,r14 + + srlz.d + cmp.ne p8,p0=r0,r0 // p8 <- FALSE + /* Note: if r17 is a NaT, p6 will be set to zero. */ + cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)? + ;; +(p6) ld8 r18=[r18] + mov r21=ar.fpsr + add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; +(p6) mov b7=r18 +(p6) tbit.z p8,p0=r18,0 +(p8) br.dptk.many b7 + +(p6) rsm psr.i + mov r27=ar.rsc + mov r26=ar.pfs + ;; + mov r29=psr // read psr (12 cyc load latency) +/* + * brl.cond doesn't work as intended because the linker would convert this branch + * into a branch to a PLT. Perhaps there will be a way to avoid this with some + * future version of the linker. In the meantime, we just use an indirect branch + * instead. + */ +#ifdef CONFIG_ITANIUM +(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down + ;; +(p6) mov b7=r14 +(p6) br.sptk.many b7 +#else + BRL_COND_FSYS_BUBBLE_DOWN(p6) +#endif + + mov r10=-1 +(p10) mov r8=EINVAL +(p9) mov r8=ENOSYS + FSYS_RETURN +END(__kernel_syscall_via_epc) + +# define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) +# define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) +# define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) +# define SIGHANDLER_OFF (16 + IA64_SIGFRAME_HANDLER_OFFSET) +# define SIGCONTEXT_OFF (16 + IA64_SIGFRAME_SIGCONTEXT_OFFSET) + +# define FLAGS_OFF IA64_SIGCONTEXT_FLAGS_OFFSET +# define CFM_OFF IA64_SIGCONTEXT_CFM_OFFSET +# define FR6_OFF IA64_SIGCONTEXT_FR6_OFFSET +# define BSP_OFF IA64_SIGCONTEXT_AR_BSP_OFFSET +# define RNAT_OFF IA64_SIGCONTEXT_AR_RNAT_OFFSET +# define UNAT_OFF IA64_SIGCONTEXT_AR_UNAT_OFFSET +# define FPSR_OFF IA64_SIGCONTEXT_AR_FPSR_OFFSET +# define PR_OFF IA64_SIGCONTEXT_PR_OFFSET +# define RP_OFF IA64_SIGCONTEXT_IP_OFFSET +# define SP_OFF IA64_SIGCONTEXT_R12_OFFSET +# define RBS_BASE_OFF IA64_SIGCONTEXT_RBS_BASE_OFFSET +# define LOADRS_OFF IA64_SIGCONTEXT_LOADRS_OFFSET +# define base0 r2 +# define base1 r3 + /* + * When we get here, the memory stack looks like this: + * + * +===============================+ + * | | + * // struct sigframe // + * | | + * +-------------------------------+ <-- sp+16 + * | 16 byte of scratch | + * | space | + * +-------------------------------+ <-- sp + * + * The register stack looks _exactly_ the way it looked at the time the signal + * occurred. 
In other words, we're treading on a potential mine-field: each
+ * incoming general register may be a NaT value (including sp, in which case the
+ * process ends up dying with a SIGSEGV).
+ *
+ * The first thing we need to do is a cover to get the registers onto the backing
+ * store. Once that is done, we invoke the signal handler which may modify some
+ * of the machine state. After returning from the signal handler, we return
+ * control to the previous context by executing a sigreturn system call. A signal
+ * handler may call the rt_sigreturn() function to directly return to a given
+ * sigcontext. However, the user-level sigreturn() needs to do much more than
+ * calling the rt_sigreturn() system call as it needs to unwind the stack to
+ * restore preserved registers that may have been saved on the signal handler's
+ * call stack.
+ */
+
+#define SIGTRAMP_SAVES \
+ .unwabi 3, 's'; /* mark this as a sigtramp handler (saves scratch regs) */ \
+ .unwabi @svr4, 's'; /* backwards compatibility with old unwinders (remove in v2.7) */ \
+ .savesp ar.unat, UNAT_OFF+SIGCONTEXT_OFF; \
+ .savesp ar.fpsr, FPSR_OFF+SIGCONTEXT_OFF; \
+ .savesp pr, PR_OFF+SIGCONTEXT_OFF; \
+ .savesp rp, RP_OFF+SIGCONTEXT_OFF; \
+ .savesp ar.pfs, CFM_OFF+SIGCONTEXT_OFF; \
+ .vframesp SP_OFF+SIGCONTEXT_OFF
+
+GLOBAL_ENTRY(__kernel_sigtramp)
+ // describe the state that is active when we get here:
+ .prologue
+ SIGTRAMP_SAVES
+ .body
+
+ .label_state 1
+
+ adds base0=SIGHANDLER_OFF,sp
+ adds base1=RBS_BASE_OFF+SIGCONTEXT_OFF,sp
+ br.call.sptk.many rp=1f
+1:
+ ld8 r17=[base0],(ARG0_OFF-SIGHANDLER_OFF) // get pointer to signal handler's plabel
+ ld8 r15=[base1] // get address of new RBS base (or NULL)
+ cover // push args in interrupted frame onto backing store
+ ;;
+ cmp.ne p1,p0=r15,r0 // do we need to switch rbs? (note: pr is saved by kernel)
+ mov.m r9=ar.bsp // fetch ar.bsp
+ .spillsp.p p1, ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
+(p1) br.cond.spnt setup_rbs // yup -> (clobbers p8, r14-r16, and r18-r20)
+back_from_setup_rbs:
+ alloc r8=ar.pfs,0,0,3,0
+ ld8 out0=[base0],16 // load arg0 (signum)
+ adds base1=(ARG1_OFF-(RBS_BASE_OFF+SIGCONTEXT_OFF)),base1
+ ;;
+ ld8 out1=[base1] // load arg1 (siginfop)
+ ld8 r10=[r17],8 // get signal handler entry point
+ ;;
+ ld8 out2=[base0] // load arg2 (sigcontextp)
+ ld8 gp=[r17] // get signal handler's global pointer
+ adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
+ ;;
+ .spillsp ar.bsp, BSP_OFF+SIGCONTEXT_OFF
+ st8 [base0]=r9 // save sc_ar_bsp
+ adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
+ adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
+ ;;
+ stf.spill [base0]=f6,32
+ stf.spill [base1]=f7,32
+ ;;
+ stf.spill [base0]=f8,32
+ stf.spill [base1]=f9,32
+ mov b6=r10
+ ;;
+ stf.spill [base0]=f10,32
+ stf.spill [base1]=f11,32
+ ;;
+ stf.spill [base0]=f12,32
+ stf.spill [base1]=f13,32
+ ;;
+ stf.spill [base0]=f14,32
+ stf.spill [base1]=f15,32
+ br.call.sptk.many rp=b6 // call the signal handler
+.ret0: adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp
+ ;;
+ ld8 r15=[base0] // fetch sc_ar_bsp
+ mov r14=ar.bsp
+ ;;
+ cmp.ne p1,p0=r14,r15 // do we need to restore the rbs?
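+ /*
+ * (Sketch) The trampoline up to this point, in C-like terms:
+ *
+ * cover(); // move interrupted frame to the backing store
+ * if (sc->sc_rbs_base) // alternate register backing store?
+ * setup_rbs(); // switch to it, saving ar.rnat/loadrs
+ * (*handler)(signum, siginfop, sigcontextp);
+ * if (ar.bsp != saved_sc_ar_bsp) // the compare just above
+ * restore_rbs();
+ * rt_sigreturn(); // return to the interrupted context
+ */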
+(p1) br.cond.spnt restore_rbs // yup -> (clobbers r14-r18, f6 & f7) + ;; +back_from_restore_rbs: + adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp + adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp + ;; + ldf.fill f6=[base0],32 + ldf.fill f7=[base1],32 + ;; + ldf.fill f8=[base0],32 + ldf.fill f9=[base1],32 + ;; + ldf.fill f10=[base0],32 + ldf.fill f11=[base1],32 + ;; + ldf.fill f12=[base0],32 + ldf.fill f13=[base1],32 + ;; + ldf.fill f14=[base0],32 + ldf.fill f15=[base1],32 + mov r15=__NR_rt_sigreturn + .restore sp // pop .prologue + break __BREAK_SYSCALL + + .prologue + SIGTRAMP_SAVES +setup_rbs: + mov ar.rsc=0 // put RSE into enforced lazy mode + ;; + .save ar.rnat, r19 + mov r19=ar.rnat // save RNaT before switching backing store area + adds r14=(RNAT_OFF+SIGCONTEXT_OFF),sp + + mov r18=ar.bspstore + mov ar.bspstore=r15 // switch over to new register backing store area + ;; + + .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF + st8 [r14]=r19 // save sc_ar_rnat + .body + mov.m r16=ar.bsp // sc_loadrs <- (new bsp - new bspstore) << 16 + adds r14=(LOADRS_OFF+SIGCONTEXT_OFF),sp + ;; + invala + sub r15=r16,r15 + extr.u r20=r18,3,6 + ;; + mov ar.rsc=0xf // set RSE into eager mode, pl 3 + cmp.eq p8,p0=63,r20 + shl r15=r15,16 + ;; + st8 [r14]=r15 // save sc_loadrs +(p8) st8 [r18]=r19 // if bspstore points at RNaT slot, store RNaT there now + .restore sp // pop .prologue + br.cond.sptk back_from_setup_rbs + + .prologue + SIGTRAMP_SAVES + .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF + .body +restore_rbs: + // On input: + // r14 = bsp1 (bsp at the time of return from signal handler) + // r15 = bsp0 (bsp at the time the signal occurred) + // + // Here, we need to calculate bspstore0, the value that ar.bspstore needs + // to be set to, based on bsp0 and the size of the dirty partition on + // the alternate stack (sc_loadrs >> 16). This can be done with the + // following algorithm: + // + // bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1)); + // + // This is what the code below does. + // + alloc r2=ar.pfs,0,0,0,0 // alloc null frame + adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp + adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp + ;; + ld8 r17=[r16] + ld8 r16=[r18] // get new rnat + extr.u r18=r15,3,6 // r18 <- rse_slot_num(bsp0) + ;; + mov ar.rsc=r17 // put RSE into enforced lazy mode + shr.u r17=r17,16 + ;; + sub r14=r14,r17 // r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16) + shr.u r17=r17,3 // r17 <- (sc_loadrs >> 19) + ;; + loadrs // restore dirty partition + extr.u r14=r14,3,6 // r14 <- rse_slot_num(bspstore1) + ;; + add r14=r14,r17 // r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19) + ;; + shr.u r14=r14,6 // r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40 + ;; + sub r14=r14,r17 // r14 <- -rse_num_regs(bspstore1, bsp1) + movl r17=0x8208208208208209 + ;; + add r18=r18,r14 // r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1) + setf.sig f7=r17 + cmp.lt p7,p0=r14,r0 // p7 <- (r14 < 0)? 
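+ /*
+ * (Sketch) The multiply below is signed magic-number division by 63:
+ * with M = 0x8208208208208209 (= ceil(2^69/63) as an unsigned value,
+ * treated as signed by xmpy.h), the sequence computes
+ *
+ * q = ((s64)(((s128)delta * M) >> 64) + delta) >> 5;
+ * if (delta < 0) q++;
+ *
+ * which equals delta/63 -- and together with the "delta -= 62"
+ * adjustment just below, floors the quotient for negative deltas.
+ * The divide by 63 is needed because every 64th RSE slot holds an
+ * RNaT collection word rather than a stacked register.
+ */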
+ ;; +(p7) adds r18=-62,r18 // delta -= 62 + ;; + setf.sig f6=r18 + ;; + xmpy.h f6=f6,f7 + ;; + getf.sig r17=f6 + ;; + add r17=r17,r18 + shr r18=r18,63 + ;; + shr r17=r17,5 + ;; + sub r17=r17,r18 // r17 = delta/63 + ;; + add r17=r14,r17 // r17 <- delta/63 - rse_num_regs(bspstore1, bsp1) + ;; + shladd r15=r17,3,r15 // r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1)) + ;; + mov ar.bspstore=r15 // switch back to old register backing store area + ;; + mov ar.rnat=r16 // restore RNaT + mov ar.rsc=0xf // (will be restored later on from sc_ar_rsc) + // invala not necessary as that will happen when returning to user-mode + br.cond.sptk back_from_restore_rbs +END(__kernel_sigtramp) diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S new file mode 100644 index 000000000000..e1e4aba9ecd0 --- /dev/null +++ b/arch/ia64/kernel/gate.lds.S @@ -0,0 +1,95 @@ +/* + * Linker script for gate DSO. The gate pages are an ELF shared object prelinked to its + * virtual address, with only one read-only segment and one execute-only segment (both fit + * in one page). This script controls its layout. + */ + +#include <linux/config.h> + +#include <asm/system.h> + +SECTIONS +{ + . = GATE_ADDR + SIZEOF_HEADERS; + + .hash : { *(.hash) } :readable + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .dynamic : { *(.dynamic) } :readable :dynamic + + /* + * This linker script is used both with -r and with -shared. For the layouts to match, + * we need to skip more than enough space for the dynamic symbol table et al. If this + * amount is insufficient, ld -shared will barf. Just increase it here. + */ + . = GATE_ADDR + 0x500; + + .data.patch : { + __start_gate_mckinley_e9_patchlist = .; + *(.data.patch.mckinley_e9) + __end_gate_mckinley_e9_patchlist = .; + + __start_gate_vtop_patchlist = .; + *(.data.patch.vtop) + __end_gate_vtop_patchlist = .; + + __start_gate_fsyscall_patchlist = .; + *(.data.patch.fsyscall_table) + __end_gate_fsyscall_patchlist = .; + + __start_gate_brl_fsys_bubble_down_patchlist = .; + *(.data.patch.brl_fsys_bubble_down) + __end_gate_brl_fsys_bubble_down_patchlist = .; + } :readable + .IA_64.unwind_info : { *(.IA_64.unwind_info*) } + .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind +#ifdef HAVE_BUGGY_SEGREL + .text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) } :readable +#else + . = ALIGN (PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1)); + .text : { *(.text) *(.text.*) } :epc +#endif + + /DISCARD/ : { + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(__ex_table) + } +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + readable PT_LOAD FILEHDR PHDRS FLAGS(4); /* PF_R */ +#ifndef HAVE_BUGGY_SEGREL + epc PT_LOAD FILEHDR PHDRS FLAGS(1); /* PF_X */ +#endif + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + unwind 0x70000001; /* PT_IA_64_UNWIND, but ld doesn't match the name */ +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_syscall_via_break; + __kernel_syscall_via_epc; + __kernel_sigtramp; + + local: *; + }; +} + +/* The ELF entry point can be used to set the AT_SYSINFO value. 
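+ * That is, the kernel can advertise __kernel_syscall_via_epc to user space
+ * through the ELF auxiliary vector, and libc can then enter the kernel by
+ * branching to AT_SYSINFO rather than by raising a break trap
+ * (break __BREAK_SYSCALL), which is the slower path.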
*/ +ENTRY(__kernel_syscall_via_epc) diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S new file mode 100644 index 000000000000..105c7fec8c6d --- /dev/null +++ b/arch/ia64/kernel/head.S @@ -0,0 +1,996 @@ +/* + * Here is where the ball gets rolling as far as the kernel is concerned. + * When control is transferred to _start, the bootload has already + * loaded us to the correct address. All that's left to do here is + * to set up the kernel's global pointer and jump to the kernel + * entry point. + * + * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999 Intel Corp. + * Copyright (C) 1999 Asit Mallick <Asit.K.Mallick@intel.com> + * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com> + * Copyright (C) 2002 Fenghua Yu <fenghua.yu@intel.com> + * -Optimize __ia64_save_fpu() and __ia64_load_fpu() for Itanium 2. + */ + +#include <linux/config.h> + +#include <asm/asmmacro.h> +#include <asm/fpu.h> +#include <asm/kregs.h> +#include <asm/mmu_context.h> +#include <asm/offsets.h> +#include <asm/pal.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/system.h> + + .section __special_page_section,"ax" + + .global empty_zero_page +empty_zero_page: + .skip PAGE_SIZE + + .global swapper_pg_dir +swapper_pg_dir: + .skip PAGE_SIZE + + .rodata +halt_msg: + stringz "Halting kernel\n" + + .text + + .global start_ap + + /* + * Start the kernel. When the bootloader passes control to _start(), r28 + * points to the address of the boot parameter area. Execution reaches + * here in physical mode. + */ +GLOBAL_ENTRY(_start) +start_ap: + .prologue + .save rp, r0 // terminate unwind chain with a NULL rp + .body + + rsm psr.i | psr.ic + ;; + srlz.i + ;; + /* + * Initialize kernel region registers: + * rr[0]: VHPT enabled, page size = PAGE_SHIFT + * rr[1]: VHPT enabled, page size = PAGE_SHIFT + * rr[2]: VHPT enabled, page size = PAGE_SHIFT + * rr[3]: VHPT enabled, page size = PAGE_SHIFT + * rr[4]: VHPT enabled, page size = PAGE_SHIFT + * rr[5]: VHPT enabled, page size = PAGE_SHIFT + * rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT + * rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT + * We initialize all of them to prevent inadvertently assuming + * something about the state of address translation early in boot. 
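+	 *
+	 * Each value written below follows the region register layout:
+	 * bit 0 = VHPT enable, bits 2-7 = preferred page size, bits 8-31 =
+	 * region id, hence (rid << 8) | (page_shift << 2) | ve.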
+ */ + mov r6=((ia64_rid(IA64_REGION_ID_KERNEL, (0<<61)) << 8) | (PAGE_SHIFT << 2) | 1) + movl r7=(0<<61) + mov r8=((ia64_rid(IA64_REGION_ID_KERNEL, (1<<61)) << 8) | (PAGE_SHIFT << 2) | 1) + movl r9=(1<<61) + mov r10=((ia64_rid(IA64_REGION_ID_KERNEL, (2<<61)) << 8) | (PAGE_SHIFT << 2) | 1) + movl r11=(2<<61) + mov r12=((ia64_rid(IA64_REGION_ID_KERNEL, (3<<61)) << 8) | (PAGE_SHIFT << 2) | 1) + movl r13=(3<<61) + mov r14=((ia64_rid(IA64_REGION_ID_KERNEL, (4<<61)) << 8) | (PAGE_SHIFT << 2) | 1) + movl r15=(4<<61) + mov r16=((ia64_rid(IA64_REGION_ID_KERNEL, (5<<61)) << 8) | (PAGE_SHIFT << 2) | 1) + movl r17=(5<<61) + mov r18=((ia64_rid(IA64_REGION_ID_KERNEL, (6<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) + movl r19=(6<<61) + mov r20=((ia64_rid(IA64_REGION_ID_KERNEL, (7<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) + movl r21=(7<<61) + ;; + mov rr[r7]=r6 + mov rr[r9]=r8 + mov rr[r11]=r10 + mov rr[r13]=r12 + mov rr[r15]=r14 + mov rr[r17]=r16 + mov rr[r19]=r18 + mov rr[r21]=r20 + ;; + /* + * Now pin mappings into the TLB for kernel text and data + */ + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + movl r17=KERNEL_START + ;; + mov cr.itir=r18 + mov cr.ifa=r17 + mov r16=IA64_TR_KERNEL + mov r3=ip + movl r18=PAGE_KERNEL + ;; + dep r2=0,r3,0,KERNEL_TR_PAGE_SHIFT + ;; + or r18=r2,r18 + ;; + srlz.i + ;; + itr.i itr[r16]=r18 + ;; + itr.d dtr[r16]=r18 + ;; + srlz.i + + /* + * Switch into virtual mode: + */ + movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \ + |IA64_PSR_DI) + ;; + mov cr.ipsr=r16 + movl r17=1f + ;; + mov cr.iip=r17 + mov cr.ifs=r0 + ;; + rfi + ;; +1: // now we are in virtual mode + + // set IVT entry point---can't access I/O ports without it + movl r3=ia64_ivt + ;; + mov cr.iva=r3 + movl r2=FPSR_DEFAULT + ;; + srlz.i + movl gp=__gp + + mov ar.fpsr=r2 + ;; + +#define isAP p2 // are we an Application Processor? +#define isBP p3 // are we the Bootstrap Processor? + +#ifdef CONFIG_SMP + /* + * Find the init_task for the currently booting CPU. At poweron, and in + * UP mode, task_for_booting_cpu is NULL. + */ + movl r3=task_for_booting_cpu + ;; + ld8 r3=[r3] + movl r2=init_task + ;; + cmp.eq isBP,isAP=r3,r0 + ;; +(isAP) mov r2=r3 +#else + movl r2=init_task + cmp.eq isBP,isAP=r0,r0 +#endif + ;; + tpa r3=r2 // r3 == phys addr of task struct + mov r16=-1 +(isBP) br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it + + // load mapping for stack (virtaddr in r2, physaddr in r3) + rsm psr.ic + movl r17=PAGE_KERNEL + ;; + srlz.d + dep r18=0,r3,0,12 + ;; + or r18=r17,r18 + dep r2=-1,r3,61,3 // IMVA of task + ;; + mov r17=rr[r2] + shr.u r16=r3,IA64_GRANULE_SHIFT + ;; + dep r17=0,r17,8,24 + ;; + mov cr.itir=r17 + mov cr.ifa=r2 + + mov r19=IA64_TR_CURRENT_STACK + ;; + itr.d dtr[r19]=r18 + ;; + ssm psr.ic + srlz.d + ;; + +.load_current: + // load the "current" pointer (r13) and ar.k6 with the current task + mov IA64_KR(CURRENT)=r2 // virtual address + mov IA64_KR(CURRENT_STACK)=r16 + mov r13=r2 + /* + * Reserve space at the top of the stack for "struct pt_regs". Kernel threads + * don't store interesting values in that structure, but the space still needs + * to be there because time-critical stuff such as the context switching can + * be implemented more efficiently (for example, __switch_to() + * always sets the psr.dfh bit of the task it is switching to). 
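+	 *
+	 * The per-task area thus ends up laid out roughly as follows (offsets
+	 * come from asm/offsets.h):
+	 *
+	 *	task_struct + thread_info			(bottom)
+	 *	IA64_RBS_OFFSET: register backing store, grows up
+	 *	    ... gap ...
+	 *	IA64_STK_OFFSET - IA64_PT_REGS_SIZE - 16: initial sp,
+	 *	    memory stack grows down from here
+	 *	IA64_STK_OFFSET: pt_regs reserve		(top)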
+ */ + addl r12=IA64_STK_OFFSET-IA64_PT_REGS_SIZE-16,r2 + addl r2=IA64_RBS_OFFSET,r2 // initialize the RSE + mov ar.rsc=0 // place RSE in enforced lazy mode + ;; + loadrs // clear the dirty partition + ;; + mov ar.bspstore=r2 // establish the new RSE stack + ;; + mov ar.rsc=0x3 // place RSE in eager mode + +(isBP) dep r28=-1,r28,61,3 // make address virtual +(isBP) movl r2=ia64_boot_param + ;; +(isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader + +#ifdef CONFIG_SMP +(isAP) br.call.sptk.many rp=start_secondary +.ret0: +(isAP) br.cond.sptk self +#endif + + // This is executed by the bootstrap processor (bsp) only: + +#ifdef CONFIG_IA64_FW_EMU + // initialize PAL & SAL emulator: + br.call.sptk.many rp=sys_fw_init +.ret1: +#endif + br.call.sptk.many rp=start_kernel +.ret2: addl r3=@ltoff(halt_msg),gp + ;; + alloc r2=ar.pfs,8,0,2,0 + ;; + ld8 out0=[r3] + br.call.sptk.many b0=console_print + +self: hint @pause + br.sptk.many self // endless loop +END(_start) + +GLOBAL_ENTRY(ia64_save_debug_regs) + alloc r16=ar.pfs,1,0,0,0 + mov r20=ar.lc // preserve ar.lc + mov ar.lc=IA64_NUM_DBG_REGS-1 + mov r18=0 + add r19=IA64_NUM_DBG_REGS*8,in0 + ;; +1: mov r16=dbr[r18] +#ifdef CONFIG_ITANIUM + ;; + srlz.d +#endif + mov r17=ibr[r18] + add r18=1,r18 + ;; + st8.nta [in0]=r16,8 + st8.nta [r19]=r17,8 + br.cloop.sptk.many 1b + ;; + mov ar.lc=r20 // restore ar.lc + br.ret.sptk.many rp +END(ia64_save_debug_regs) + +GLOBAL_ENTRY(ia64_load_debug_regs) + alloc r16=ar.pfs,1,0,0,0 + lfetch.nta [in0] + mov r20=ar.lc // preserve ar.lc + add r19=IA64_NUM_DBG_REGS*8,in0 + mov ar.lc=IA64_NUM_DBG_REGS-1 + mov r18=-1 + ;; +1: ld8.nta r16=[in0],8 + ld8.nta r17=[r19],8 + add r18=1,r18 + ;; + mov dbr[r18]=r16 +#ifdef CONFIG_ITANIUM + ;; + srlz.d // Errata 132 (NoFix status) +#endif + mov ibr[r18]=r17 + br.cloop.sptk.many 1b + ;; + mov ar.lc=r20 // restore ar.lc + br.ret.sptk.many rp +END(ia64_load_debug_regs) + +GLOBAL_ENTRY(__ia64_save_fpu) + alloc r2=ar.pfs,1,4,0,0 + adds loc0=96*16-16,in0 + adds loc1=96*16-16-128,in0 + ;; + stf.spill.nta [loc0]=f127,-256 + stf.spill.nta [loc1]=f119,-256 + ;; + stf.spill.nta [loc0]=f111,-256 + stf.spill.nta [loc1]=f103,-256 + ;; + stf.spill.nta [loc0]=f95,-256 + stf.spill.nta [loc1]=f87,-256 + ;; + stf.spill.nta [loc0]=f79,-256 + stf.spill.nta [loc1]=f71,-256 + ;; + stf.spill.nta [loc0]=f63,-256 + stf.spill.nta [loc1]=f55,-256 + adds loc2=96*16-32,in0 + ;; + stf.spill.nta [loc0]=f47,-256 + stf.spill.nta [loc1]=f39,-256 + adds loc3=96*16-32-128,in0 + ;; + stf.spill.nta [loc2]=f126,-256 + stf.spill.nta [loc3]=f118,-256 + ;; + stf.spill.nta [loc2]=f110,-256 + stf.spill.nta [loc3]=f102,-256 + ;; + stf.spill.nta [loc2]=f94,-256 + stf.spill.nta [loc3]=f86,-256 + ;; + stf.spill.nta [loc2]=f78,-256 + stf.spill.nta [loc3]=f70,-256 + ;; + stf.spill.nta [loc2]=f62,-256 + stf.spill.nta [loc3]=f54,-256 + adds loc0=96*16-48,in0 + ;; + stf.spill.nta [loc2]=f46,-256 + stf.spill.nta [loc3]=f38,-256 + adds loc1=96*16-48-128,in0 + ;; + stf.spill.nta [loc0]=f125,-256 + stf.spill.nta [loc1]=f117,-256 + ;; + stf.spill.nta [loc0]=f109,-256 + stf.spill.nta [loc1]=f101,-256 + ;; + stf.spill.nta [loc0]=f93,-256 + stf.spill.nta [loc1]=f85,-256 + ;; + stf.spill.nta [loc0]=f77,-256 + stf.spill.nta [loc1]=f69,-256 + ;; + stf.spill.nta [loc0]=f61,-256 + stf.spill.nta [loc1]=f53,-256 + adds loc2=96*16-64,in0 + ;; + stf.spill.nta [loc0]=f45,-256 + stf.spill.nta [loc1]=f37,-256 + adds loc3=96*16-64-128,in0 + ;; + stf.spill.nta [loc2]=f124,-256 + stf.spill.nta [loc3]=f116,-256 + ;; + 
stf.spill.nta [loc2]=f108,-256 + stf.spill.nta [loc3]=f100,-256 + ;; + stf.spill.nta [loc2]=f92,-256 + stf.spill.nta [loc3]=f84,-256 + ;; + stf.spill.nta [loc2]=f76,-256 + stf.spill.nta [loc3]=f68,-256 + ;; + stf.spill.nta [loc2]=f60,-256 + stf.spill.nta [loc3]=f52,-256 + adds loc0=96*16-80,in0 + ;; + stf.spill.nta [loc2]=f44,-256 + stf.spill.nta [loc3]=f36,-256 + adds loc1=96*16-80-128,in0 + ;; + stf.spill.nta [loc0]=f123,-256 + stf.spill.nta [loc1]=f115,-256 + ;; + stf.spill.nta [loc0]=f107,-256 + stf.spill.nta [loc1]=f99,-256 + ;; + stf.spill.nta [loc0]=f91,-256 + stf.spill.nta [loc1]=f83,-256 + ;; + stf.spill.nta [loc0]=f75,-256 + stf.spill.nta [loc1]=f67,-256 + ;; + stf.spill.nta [loc0]=f59,-256 + stf.spill.nta [loc1]=f51,-256 + adds loc2=96*16-96,in0 + ;; + stf.spill.nta [loc0]=f43,-256 + stf.spill.nta [loc1]=f35,-256 + adds loc3=96*16-96-128,in0 + ;; + stf.spill.nta [loc2]=f122,-256 + stf.spill.nta [loc3]=f114,-256 + ;; + stf.spill.nta [loc2]=f106,-256 + stf.spill.nta [loc3]=f98,-256 + ;; + stf.spill.nta [loc2]=f90,-256 + stf.spill.nta [loc3]=f82,-256 + ;; + stf.spill.nta [loc2]=f74,-256 + stf.spill.nta [loc3]=f66,-256 + ;; + stf.spill.nta [loc2]=f58,-256 + stf.spill.nta [loc3]=f50,-256 + adds loc0=96*16-112,in0 + ;; + stf.spill.nta [loc2]=f42,-256 + stf.spill.nta [loc3]=f34,-256 + adds loc1=96*16-112-128,in0 + ;; + stf.spill.nta [loc0]=f121,-256 + stf.spill.nta [loc1]=f113,-256 + ;; + stf.spill.nta [loc0]=f105,-256 + stf.spill.nta [loc1]=f97,-256 + ;; + stf.spill.nta [loc0]=f89,-256 + stf.spill.nta [loc1]=f81,-256 + ;; + stf.spill.nta [loc0]=f73,-256 + stf.spill.nta [loc1]=f65,-256 + ;; + stf.spill.nta [loc0]=f57,-256 + stf.spill.nta [loc1]=f49,-256 + adds loc2=96*16-128,in0 + ;; + stf.spill.nta [loc0]=f41,-256 + stf.spill.nta [loc1]=f33,-256 + adds loc3=96*16-128-128,in0 + ;; + stf.spill.nta [loc2]=f120,-256 + stf.spill.nta [loc3]=f112,-256 + ;; + stf.spill.nta [loc2]=f104,-256 + stf.spill.nta [loc3]=f96,-256 + ;; + stf.spill.nta [loc2]=f88,-256 + stf.spill.nta [loc3]=f80,-256 + ;; + stf.spill.nta [loc2]=f72,-256 + stf.spill.nta [loc3]=f64,-256 + ;; + stf.spill.nta [loc2]=f56,-256 + stf.spill.nta [loc3]=f48,-256 + ;; + stf.spill.nta [loc2]=f40 + stf.spill.nta [loc3]=f32 + br.ret.sptk.many rp +END(__ia64_save_fpu) + +GLOBAL_ENTRY(__ia64_load_fpu) + alloc r2=ar.pfs,1,2,0,0 + adds r3=128,in0 + adds r14=256,in0 + adds r15=384,in0 + mov loc0=512 + mov loc1=-1024+16 + ;; + ldf.fill.nta f32=[in0],loc0 + ldf.fill.nta f40=[ r3],loc0 + ldf.fill.nta f48=[r14],loc0 + ldf.fill.nta f56=[r15],loc0 + ;; + ldf.fill.nta f64=[in0],loc0 + ldf.fill.nta f72=[ r3],loc0 + ldf.fill.nta f80=[r14],loc0 + ldf.fill.nta f88=[r15],loc0 + ;; + ldf.fill.nta f96=[in0],loc1 + ldf.fill.nta f104=[ r3],loc1 + ldf.fill.nta f112=[r14],loc1 + ldf.fill.nta f120=[r15],loc1 + ;; + ldf.fill.nta f33=[in0],loc0 + ldf.fill.nta f41=[ r3],loc0 + ldf.fill.nta f49=[r14],loc0 + ldf.fill.nta f57=[r15],loc0 + ;; + ldf.fill.nta f65=[in0],loc0 + ldf.fill.nta f73=[ r3],loc0 + ldf.fill.nta f81=[r14],loc0 + ldf.fill.nta f89=[r15],loc0 + ;; + ldf.fill.nta f97=[in0],loc1 + ldf.fill.nta f105=[ r3],loc1 + ldf.fill.nta f113=[r14],loc1 + ldf.fill.nta f121=[r15],loc1 + ;; + ldf.fill.nta f34=[in0],loc0 + ldf.fill.nta f42=[ r3],loc0 + ldf.fill.nta f50=[r14],loc0 + ldf.fill.nta f58=[r15],loc0 + ;; + ldf.fill.nta f66=[in0],loc0 + ldf.fill.nta f74=[ r3],loc0 + ldf.fill.nta f82=[r14],loc0 + ldf.fill.nta f90=[r15],loc0 + ;; + ldf.fill.nta f98=[in0],loc1 + ldf.fill.nta f106=[ r3],loc1 + ldf.fill.nta f114=[r14],loc1 + ldf.fill.nta f122=[r15],loc1 + ;; + 
ldf.fill.nta f35=[in0],loc0 + ldf.fill.nta f43=[ r3],loc0 + ldf.fill.nta f51=[r14],loc0 + ldf.fill.nta f59=[r15],loc0 + ;; + ldf.fill.nta f67=[in0],loc0 + ldf.fill.nta f75=[ r3],loc0 + ldf.fill.nta f83=[r14],loc0 + ldf.fill.nta f91=[r15],loc0 + ;; + ldf.fill.nta f99=[in0],loc1 + ldf.fill.nta f107=[ r3],loc1 + ldf.fill.nta f115=[r14],loc1 + ldf.fill.nta f123=[r15],loc1 + ;; + ldf.fill.nta f36=[in0],loc0 + ldf.fill.nta f44=[ r3],loc0 + ldf.fill.nta f52=[r14],loc0 + ldf.fill.nta f60=[r15],loc0 + ;; + ldf.fill.nta f68=[in0],loc0 + ldf.fill.nta f76=[ r3],loc0 + ldf.fill.nta f84=[r14],loc0 + ldf.fill.nta f92=[r15],loc0 + ;; + ldf.fill.nta f100=[in0],loc1 + ldf.fill.nta f108=[ r3],loc1 + ldf.fill.nta f116=[r14],loc1 + ldf.fill.nta f124=[r15],loc1 + ;; + ldf.fill.nta f37=[in0],loc0 + ldf.fill.nta f45=[ r3],loc0 + ldf.fill.nta f53=[r14],loc0 + ldf.fill.nta f61=[r15],loc0 + ;; + ldf.fill.nta f69=[in0],loc0 + ldf.fill.nta f77=[ r3],loc0 + ldf.fill.nta f85=[r14],loc0 + ldf.fill.nta f93=[r15],loc0 + ;; + ldf.fill.nta f101=[in0],loc1 + ldf.fill.nta f109=[ r3],loc1 + ldf.fill.nta f117=[r14],loc1 + ldf.fill.nta f125=[r15],loc1 + ;; + ldf.fill.nta f38 =[in0],loc0 + ldf.fill.nta f46 =[ r3],loc0 + ldf.fill.nta f54 =[r14],loc0 + ldf.fill.nta f62 =[r15],loc0 + ;; + ldf.fill.nta f70 =[in0],loc0 + ldf.fill.nta f78 =[ r3],loc0 + ldf.fill.nta f86 =[r14],loc0 + ldf.fill.nta f94 =[r15],loc0 + ;; + ldf.fill.nta f102=[in0],loc1 + ldf.fill.nta f110=[ r3],loc1 + ldf.fill.nta f118=[r14],loc1 + ldf.fill.nta f126=[r15],loc1 + ;; + ldf.fill.nta f39 =[in0],loc0 + ldf.fill.nta f47 =[ r3],loc0 + ldf.fill.nta f55 =[r14],loc0 + ldf.fill.nta f63 =[r15],loc0 + ;; + ldf.fill.nta f71 =[in0],loc0 + ldf.fill.nta f79 =[ r3],loc0 + ldf.fill.nta f87 =[r14],loc0 + ldf.fill.nta f95 =[r15],loc0 + ;; + ldf.fill.nta f103=[in0] + ldf.fill.nta f111=[ r3] + ldf.fill.nta f119=[r14] + ldf.fill.nta f127=[r15] + br.ret.sptk.many rp +END(__ia64_load_fpu) + +GLOBAL_ENTRY(__ia64_init_fpu) + stf.spill [sp]=f0 // M3 + mov f32=f0 // F + nop.b 0 + + ldfps f33,f34=[sp] // M0 + ldfps f35,f36=[sp] // M1 + mov f37=f0 // F + ;; + + setf.s f38=r0 // M2 + setf.s f39=r0 // M3 + mov f40=f0 // F + + ldfps f41,f42=[sp] // M0 + ldfps f43,f44=[sp] // M1 + mov f45=f0 // F + + setf.s f46=r0 // M2 + setf.s f47=r0 // M3 + mov f48=f0 // F + + ldfps f49,f50=[sp] // M0 + ldfps f51,f52=[sp] // M1 + mov f53=f0 // F + + setf.s f54=r0 // M2 + setf.s f55=r0 // M3 + mov f56=f0 // F + + ldfps f57,f58=[sp] // M0 + ldfps f59,f60=[sp] // M1 + mov f61=f0 // F + + setf.s f62=r0 // M2 + setf.s f63=r0 // M3 + mov f64=f0 // F + + ldfps f65,f66=[sp] // M0 + ldfps f67,f68=[sp] // M1 + mov f69=f0 // F + + setf.s f70=r0 // M2 + setf.s f71=r0 // M3 + mov f72=f0 // F + + ldfps f73,f74=[sp] // M0 + ldfps f75,f76=[sp] // M1 + mov f77=f0 // F + + setf.s f78=r0 // M2 + setf.s f79=r0 // M3 + mov f80=f0 // F + + ldfps f81,f82=[sp] // M0 + ldfps f83,f84=[sp] // M1 + mov f85=f0 // F + + setf.s f86=r0 // M2 + setf.s f87=r0 // M3 + mov f88=f0 // F + + /* + * When the instructions are cached, it would be faster to initialize + * the remaining registers with simply mov instructions (F-unit). + * This gets the time down to ~29 cycles. However, this would use up + * 33 bundles, whereas continuing with the above pattern yields + * 10 bundles and ~30 cycles. 
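+	 *
+	 * (Why the mix of forms: f0 was spilled to [sp] as 16 zero bytes at
+	 * entry, so each ldfps reloads a pair of registers with 0.0 from
+	 * memory, while setf.s and mov f=f0 zero one register each; the
+	 * M0/M1/M2/M3/F tags mark the issue slots this pattern keeps busy.)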
+ */ + + ldfps f89,f90=[sp] // M0 + ldfps f91,f92=[sp] // M1 + mov f93=f0 // F + + setf.s f94=r0 // M2 + setf.s f95=r0 // M3 + mov f96=f0 // F + + ldfps f97,f98=[sp] // M0 + ldfps f99,f100=[sp] // M1 + mov f101=f0 // F + + setf.s f102=r0 // M2 + setf.s f103=r0 // M3 + mov f104=f0 // F + + ldfps f105,f106=[sp] // M0 + ldfps f107,f108=[sp] // M1 + mov f109=f0 // F + + setf.s f110=r0 // M2 + setf.s f111=r0 // M3 + mov f112=f0 // F + + ldfps f113,f114=[sp] // M0 + ldfps f115,f116=[sp] // M1 + mov f117=f0 // F + + setf.s f118=r0 // M2 + setf.s f119=r0 // M3 + mov f120=f0 // F + + ldfps f121,f122=[sp] // M0 + ldfps f123,f124=[sp] // M1 + mov f125=f0 // F + + setf.s f126=r0 // M2 + setf.s f127=r0 // M3 + br.ret.sptk.many rp // F +END(__ia64_init_fpu) + +/* + * Switch execution mode from virtual to physical + * + * Inputs: + * r16 = new psr to establish + * Output: + * r19 = old virtual address of ar.bsp + * r20 = old virtual address of sp + * + * Note: RSE must already be in enforced lazy mode + */ +GLOBAL_ENTRY(ia64_switch_mode_phys) + { + alloc r2=ar.pfs,0,0,0,0 + rsm psr.i | psr.ic // disable interrupts and interrupt collection + mov r15=ip + } + ;; + { + flushrs // must be first insn in group + srlz.i + } + ;; + mov cr.ipsr=r16 // set new PSR + add r3=1f-ia64_switch_mode_phys,r15 + + mov r19=ar.bsp + mov r20=sp + mov r14=rp // get return address into a general register + ;; + + // going to physical mode, use tpa to translate virt->phys + tpa r17=r19 + tpa r3=r3 + tpa sp=sp + tpa r14=r14 + ;; + + mov r18=ar.rnat // save ar.rnat + mov ar.bspstore=r17 // this steps on ar.rnat + mov cr.iip=r3 + mov cr.ifs=r0 + ;; + mov ar.rnat=r18 // restore ar.rnat + rfi // must be last insn in group + ;; +1: mov rp=r14 + br.ret.sptk.many rp +END(ia64_switch_mode_phys) + +/* + * Switch execution mode from physical to virtual + * + * Inputs: + * r16 = new psr to establish + * r19 = new bspstore to establish + * r20 = new sp to establish + * + * Note: RSE must already be in enforced lazy mode + */ +GLOBAL_ENTRY(ia64_switch_mode_virt) + { + alloc r2=ar.pfs,0,0,0,0 + rsm psr.i | psr.ic // disable interrupts and interrupt collection + mov r15=ip + } + ;; + { + flushrs // must be first insn in group + srlz.i + } + ;; + mov cr.ipsr=r16 // set new PSR + add r3=1f-ia64_switch_mode_virt,r15 + + mov r14=rp // get return address into a general register + ;; + + // going to virtual + // - for code addresses, set upper bits of addr to KERNEL_START + // - for stack addresses, copy from input argument + movl r18=KERNEL_START + dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT + dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT + mov sp=r20 + ;; + or r3=r3,r18 + or r14=r14,r18 + ;; + + mov r18=ar.rnat // save ar.rnat + mov ar.bspstore=r19 // this steps on ar.rnat + mov cr.iip=r3 + mov cr.ifs=r0 + ;; + mov ar.rnat=r18 // restore ar.rnat + rfi // must be last insn in group + ;; +1: mov rp=r14 + br.ret.sptk.many rp +END(ia64_switch_mode_virt) + +GLOBAL_ENTRY(ia64_delay_loop) + .prologue +{ nop 0 // work around GAS unwind info generation bug... + .save ar.lc,r2 + mov r2=ar.lc + .body + ;; + mov ar.lc=r32 +} + ;; + // force loop to be 32-byte aligned (GAS bug means we cannot use .align + // inside function body without corrupting unwind info). +{ nop 0 } +1: br.cloop.sptk.few 1b + ;; + mov ar.lc=r2 + br.ret.sptk.many rp +END(ia64_delay_loop) + +/* + * Return a CPU-local timestamp in nano-seconds. 
This timestamp is
+ * NOT synchronized across CPUs; its return value must never be
+ * compared against the values returned on another CPU. The usage in
+ * kernel/sched.c ensures that.
+ *
+ * The return value of sched_clock() is NOT supposed to wrap around.
+ * If it did, it would cause some scheduling hiccups (at worst).
+ * Fortunately, with a 64-bit cycle counter ticking at 100GHz, even
+ * that would happen only once every 5+ years.
+ *
+ * The code below basically calculates:
+ *
+ *   (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT
+ *
+ * except that the multiplication and the shift are done with 128-bit
+ * intermediate precision so that we can produce a full 64-bit result.
+ */
+GLOBAL_ENTRY(sched_clock)
+	addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
+	mov.m r9=ar.itc		// fetch cycle-counter			(35 cyc)
+	;;
+	ldf8 f8=[r8]
+	;;
+	setf.sig f9=r9		// certain to stall, so issue it _after_ ldf8...
+	;;
+	xmpy.lu f10=f9,f8	// calculate low 64 bits of 128-bit product	(4 cyc)
+	xmpy.hu f11=f9,f8	// calculate high 64 bits of 128-bit product
+	;;
+	getf.sig r8=f10		//						(5 cyc)
+	getf.sig r9=f11
+	;;
+	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
+	br.ret.sptk.many rp
+END(sched_clock)
+
+GLOBAL_ENTRY(start_kernel_thread)
+	.prologue
+	.save rp, r0				// this is the end of the call-chain
+	.body
+	alloc r2 = ar.pfs, 0, 0, 2, 0
+	mov out0 = r9
+	mov out1 = r11;;
+	br.call.sptk.many rp = kernel_thread_helper;;
+	mov out0 = r8
+	br.call.sptk.many rp = sys_exit;;
+1:	br.sptk.few 1b				// not reached
+END(start_kernel_thread)
+
+#ifdef CONFIG_IA64_BRL_EMU
+
+/*
+ * Assembly routines used by brl_emu.c to set preserved register state.
+ */
+
+#define SET_REG(reg)				\
+ GLOBAL_ENTRY(ia64_set_##reg);			\
+	alloc r16=ar.pfs,1,0,0,0;		\
+	mov reg=r32;				\
+	;;					\
+	br.ret.sptk.many rp;			\
+ END(ia64_set_##reg)
+
+SET_REG(b1);
+SET_REG(b2);
+SET_REG(b3);
+SET_REG(b4);
+SET_REG(b5);
+
+#endif /* CONFIG_IA64_BRL_EMU */
+
+#ifdef CONFIG_SMP
+	/*
+	 * This routine handles spinlock contention. It uses a non-standard calling
+	 * convention to avoid converting leaf routines into interior routines. Because
+	 * of this special convention, there are several restrictions:
+	 *
+	 * - do not use gp relative variables; this code is called from the kernel
+	 *   and from modules, and r1 is undefined.
+	 * - do not use stacked registers; the caller owns them.
+	 * - do not use the scratch stack space; the caller owns it.
+	 * - do not use any registers other than the ones listed below.
+	 *
+	 * Inputs:
+	 *   ar.pfs - saved CFM of caller
+	 *   ar.ccv - 0 (and available for use)
+	 *   r27    - flags from spin_lock_irqsave or 0. Must be preserved.
+	 *   r28    - available for use.
+	 *   r29    - available for use.
+	 *   r30    - available for use.
+	 *   r31    - address of lock, available for use.
+	 *   b6     - return address
+	 *   p14    - available for use.
+	 *   p15    - used to track flag status.
+	 *
+	 * If you patch this code to use more registers, do not forget to update
+	 * the clobber lists for spin_lock() in include/asm-ia64/spinlock.h.
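+	 *
+	 * In C terms, the contention path below amounts to roughly this
+	 * (a sketch only, omitting the interrupt re-enable dance):
+	 *
+	 *	do {
+	 *		while (*lock)			// spin without writing
+	 *			cpu_relax();		//   the cache line
+	 *	} while (cmpxchg_acq(lock, 0, 1));	// then try to acquire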
+ */ + +#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) + +GLOBAL_ENTRY(ia64_spinlock_contention_pre3_4) + .prologue + .save ar.pfs, r0 // this code effectively has a zero frame size + .save rp, r28 + .body + nop 0 + tbit.nz p15,p0=r27,IA64_PSR_I_BIT + .restore sp // pop existing prologue after next insn + mov b6 = r28 + .prologue + .save ar.pfs, r0 + .altrp b6 + .body + ;; +(p15) ssm psr.i // reenable interrupts if they were on + // DavidM says that srlz.d is slow and is not required in this case +.wait: + // exponential backoff, kdb, lockmeter etc. go in here + hint @pause + ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word + nop 0 + ;; + cmp4.ne p14,p0=r30,r0 +(p14) br.cond.sptk.few .wait +(p15) rsm psr.i // disable interrupts if we reenabled them + br.cond.sptk.few b6 // lock is now free, try to acquire + .global ia64_spinlock_contention_pre3_4_end // for kernprof +ia64_spinlock_contention_pre3_4_end: +END(ia64_spinlock_contention_pre3_4) + +#else + +GLOBAL_ENTRY(ia64_spinlock_contention) + .prologue + .altrp b6 + .body + tbit.nz p15,p0=r27,IA64_PSR_I_BIT + ;; +.wait: +(p15) ssm psr.i // reenable interrupts if they were on + // DavidM says that srlz.d is slow and is not required in this case +.wait2: + // exponential backoff, kdb, lockmeter etc. go in here + hint @pause + ld4 r30=[r31] // don't use ld4.bias; if it's contended, we won't write the word + ;; + cmp4.ne p14,p0=r30,r0 + mov r30 = 1 +(p14) br.cond.sptk.few .wait2 +(p15) rsm psr.i // disable interrupts if we reenabled them + ;; + cmpxchg4.acq r30=[r31], r30, ar.ccv + ;; + cmp4.ne p14,p0=r0,r30 +(p14) br.cond.sptk.few .wait + + br.ret.sptk.many b6 // lock is now taken +END(ia64_spinlock_contention) + +#endif + +#endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c new file mode 100644 index 000000000000..7bbf019c9867 --- /dev/null +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -0,0 +1,127 @@ +/* + * Architecture-specific kernel symbols + * + * Don't put any exports here unless it's defined in an assembler file. + * All other exports should be put directly after the definition. 
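+ *
+ * For example, memset() is implemented in assembly under arch/ia64/lib/,
+ * so its EXPORT_SYMBOL() appears below rather than next to a C definition.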
+ */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/string.h> +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(memchr); +EXPORT_SYMBOL(memcmp); +EXPORT_SYMBOL(memcpy); +EXPORT_SYMBOL(memmove); +EXPORT_SYMBOL(memscan); +EXPORT_SYMBOL(strcat); +EXPORT_SYMBOL(strchr); +EXPORT_SYMBOL(strcmp); +EXPORT_SYMBOL(strcpy); +EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strncat); +EXPORT_SYMBOL(strncmp); +EXPORT_SYMBOL(strncpy); +EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strrchr); +EXPORT_SYMBOL(strstr); +EXPORT_SYMBOL(strpbrk); + +#include <asm/checksum.h> +EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ + +#include <asm/semaphore.h> +EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__down_trylock); +EXPORT_SYMBOL(__up); + +#include <asm/page.h> +EXPORT_SYMBOL(clear_page); + +#ifdef CONFIG_VIRTUAL_MEM_MAP +#include <linux/bootmem.h> +EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ +#endif + +#include <asm/processor.h> +EXPORT_SYMBOL(per_cpu__cpu_info); +#ifdef CONFIG_SMP +EXPORT_SYMBOL(per_cpu__local_per_cpu_offset); +#endif + +#include <asm/uaccess.h> +EXPORT_SYMBOL(__copy_user); +EXPORT_SYMBOL(__do_clear_user); +EXPORT_SYMBOL(__strlen_user); +EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__strnlen_user); + +#include <asm/unistd.h> +EXPORT_SYMBOL(__ia64_syscall); + +/* from arch/ia64/lib */ +extern void __divsi3(void); +extern void __udivsi3(void); +extern void __modsi3(void); +extern void __umodsi3(void); +extern void __divdi3(void); +extern void __udivdi3(void); +extern void __moddi3(void); +extern void __umoddi3(void); + +EXPORT_SYMBOL(__divsi3); +EXPORT_SYMBOL(__udivsi3); +EXPORT_SYMBOL(__modsi3); +EXPORT_SYMBOL(__umodsi3); +EXPORT_SYMBOL(__divdi3); +EXPORT_SYMBOL(__udivdi3); +EXPORT_SYMBOL(__moddi3); +EXPORT_SYMBOL(__umoddi3); + +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) +extern void xor_ia64_2(void); +extern void xor_ia64_3(void); +extern void xor_ia64_4(void); +extern void xor_ia64_5(void); + +EXPORT_SYMBOL(xor_ia64_2); +EXPORT_SYMBOL(xor_ia64_3); +EXPORT_SYMBOL(xor_ia64_4); +EXPORT_SYMBOL(xor_ia64_5); +#endif + +#include <asm/pal.h> +EXPORT_SYMBOL(ia64_pal_call_phys_stacked); +EXPORT_SYMBOL(ia64_pal_call_phys_static); +EXPORT_SYMBOL(ia64_pal_call_stacked); +EXPORT_SYMBOL(ia64_pal_call_static); +EXPORT_SYMBOL(ia64_load_scratch_fpregs); +EXPORT_SYMBOL(ia64_save_scratch_fpregs); + +#include <asm/unwind.h> +EXPORT_SYMBOL(unw_init_running); + +#ifdef ASM_SUPPORTED +# ifdef CONFIG_SMP +# if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3) +/* + * This is not a normal routine and we don't want a function descriptor for it, so we use + * a fake declaration here. + */ +extern char ia64_spinlock_contention_pre3_4; +EXPORT_SYMBOL(ia64_spinlock_contention_pre3_4); +# else +/* + * This is not a normal routine and we don't want a function descriptor for it, so we use + * a fake declaration here. + */ +extern char ia64_spinlock_contention; +EXPORT_SYMBOL(ia64_spinlock_contention); +# endif +# endif +#endif + +extern char ia64_ivt[]; +EXPORT_SYMBOL(ia64_ivt); diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c new file mode 100644 index 000000000000..b69c397ed1bf --- /dev/null +++ b/arch/ia64/kernel/init_task.c @@ -0,0 +1,46 @@ +/* + * This is where we statically allocate and initialize the initial + * task. 
+ * + * Copyright (C) 1999, 2002-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init_task.h> +#include <linux/mqueue.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +struct mm_struct init_mm = INIT_MM(init_mm); + +EXPORT_SYMBOL(init_mm); + +/* + * Initial task structure. + * + * We need to make sure that this is properly aligned due to the way process stacks are + * handled. This is done by having a special ".data.init_task" section... + */ +#define init_thread_info init_task_mem.s.thread_info + +union { + struct { + struct task_struct task; + struct thread_info thread_info; + } s; + unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; +} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{ + .task = INIT_TASK(init_task_mem.s.task), + .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) +}}; + +EXPORT_SYMBOL(init_task); diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c new file mode 100644 index 000000000000..c15be5c38f56 --- /dev/null +++ b/arch/ia64/kernel/iosapic.c @@ -0,0 +1,827 @@ +/* + * I/O SAPIC support. + * + * Copyright (C) 1999 Intel Corp. + * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com> + * Copyright (C) 2000-2002 J.I. Lee <jung-ik.lee@intel.com> + * Copyright (C) 1999-2000, 2002-2003 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> + * + * 00/04/19 D. Mosberger Rewritten to mirror more closely the x86 I/O APIC code. + * In particular, we now have separate handlers for edge + * and level triggered interrupts. + * 00/10/27 Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector allocation + * PCI to vector mapping, shared PCI interrupts. + * 00/10/27 D. Mosberger Document things a bit more to make them more understandable. + * Clean up much of the old IOSAPIC cruft. + * 01/07/27 J.I. Lee PCI irq routing, Platform/Legacy interrupts and fixes for + * ACPI S5(SoftOff) support. + * 02/01/23 J.I. Lee iosapic pgm fixes for PCI irq routing from _PRT + * 02/01/07 E. Focht <efocht@ess.nec.de> Redirectable interrupt vectors in + * iosapic_set_affinity(), initializations for + * /proc/irq/#/smp_affinity + * 02/04/02 P. Diefenbaugh Cleaned up ACPI PCI IRQ routing. + * 02/04/18 J.I. Lee bug fix in iosapic_init_pci_irq + * 02/04/30 J.I. Lee bug fix in find_iosapic to fix ACPI PCI IRQ to IOSAPIC mapping + * error + * 02/07/29 T. Kochi Allocate interrupt vectors dynamically + * 02/08/04 T. Kochi Cleaned up terminology (irq, global system interrupt, vector, etc.) + * 02/09/20 D. Mosberger Simplified by taking advantage of ACPI's pci_irq code. + * 03/02/19 B. Helgaas Make pcat_compat system-wide, not per-IOSAPIC. + * Remove iosapic_address & gsi_base from external interfaces. + * Rationalize __init/__devinit attributes. 
+ * 04/12/04	Ashok Raj	<ashok.raj@intel.com> Intel Corporation 2004
+ *					Updated to work with irq migration necessary for CPU Hotplug
+ */
+/*
+ * Here is what the interrupt logic between a PCI device and the kernel looks like:
+ *
+ * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, INTD). The
+ *     device is uniquely identified by its bus and slot number (the function
+ *     number does not matter here because all functions share the same interrupt
+ *     lines).
+ *
+ * (2) The motherboard routes the interrupt line to a pin on an IOSAPIC controller.
+ *     Multiple interrupt lines may have to share the same IOSAPIC pin (if they're level
+ *     triggered and use the same polarity). Each interrupt line has a unique Global
+ *     System Interrupt (GSI) number which can be calculated as the sum of the controller's
+ *     base GSI number and the IOSAPIC pin number to which the line connects.
+ *
+ * (3) The IOSAPIC uses internal routing table entries (RTEs) to map an IOSAPIC pin
+ *     to an IA-64 interrupt vector. This interrupt vector is then sent to the CPU.
+ *
+ * (4) The kernel recognizes an interrupt as an IRQ. The IRQ interface is used as the
+ *     architecture-independent interrupt handling mechanism in Linux. As an
+ *     IRQ is a number, we have to have an IA-64 interrupt vector number <-> IRQ number
+ *     mapping. On smaller systems, we use a one-to-one mapping between IA-64 vectors and
+ *     IRQs. A platform can implement the platform_irq_to_vector(irq) and
+ *     platform_local_vector_to_irq(vector) APIs to differentiate the mapping.
+ *     Please see also include/asm-ia64/hw_irq.h for those APIs.
+ *
+ * To sum up, there are three levels of mappings involved:
+ *
+ *	PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ
+ *
+ * Note: The term "IRQ" is loosely used everywhere in the Linux kernel to describe
+ * interrupts. Here we use "IRQ" only for Linux IRQs. ISA IRQ (isa_irq) is the only
+ * exception in this source code.
+ */
+#include <linux/config.h>
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/string.h>
+
+#include <asm/delay.h>
+#include <asm/hw_irq.h>
+#include <asm/io.h>
+#include <asm/iosapic.h>
+#include <asm/machvec.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/system.h>
+
+
+#undef DEBUG_INTERRUPT_ROUTING
+
+#ifdef DEBUG_INTERRUPT_ROUTING
+#define DBG(fmt...)	printk(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+static DEFINE_SPINLOCK(iosapic_lock);
+
+/* These tables map IA-64 vectors to the IOSAPIC pin that generates this vector.
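+ * For example (illustrative numbers): a line wired to pin 3 of an IOSAPIC
+ * whose gsi_base is 16 has GSI 19, and gsi_to_vector(19) returns whichever
+ * IA-64 vector was assigned when that GSI was registered.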
 */
+
+static struct iosapic_intr_info {
+	char __iomem	*addr;		/* base address of IOSAPIC */
+	u32		low32;		/* current value of low word of Redirection table entry */
+	unsigned int	gsi_base;	/* first GSI assigned to this IOSAPIC */
+	char		rte_index;	/* IOSAPIC RTE index (-1 => not an IOSAPIC interrupt) */
+	unsigned char	dmode	: 3;	/* delivery mode (see iosapic.h) */
+	unsigned char	polarity: 1;	/* interrupt polarity (see iosapic.h) */
+	unsigned char	trigger	: 1;	/* trigger mode (see iosapic.h) */
+	int		refcnt;		/* reference counter */
+} iosapic_intr_info[IA64_NUM_VECTORS];
+
+static struct iosapic {
+	char __iomem	*addr;		/* base address of IOSAPIC */
+	unsigned int	gsi_base;	/* first GSI assigned to this IOSAPIC */
+	unsigned short	num_rte;	/* number of RTE in this IOSAPIC */
+#ifdef CONFIG_NUMA
+	unsigned short	node;		/* numa node association via pxm */
+#endif
+} iosapic_lists[NR_IOSAPICS];
+
+static int num_iosapic;
+
+static unsigned char pcat_compat __initdata;	/* 8259 compatibility flag */
+
+
+/*
+ * Find an IOSAPIC associated with a GSI
+ */
+static inline int
+find_iosapic (unsigned int gsi)
+{
+	int i;
+
+	for (i = 0; i < num_iosapic; i++) {
+		if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < iosapic_lists[i].num_rte)
+			return i;
+	}
+
+	return -1;
+}
+
+static inline int
+_gsi_to_vector (unsigned int gsi)
+{
+	struct iosapic_intr_info *info;
+
+	for (info = iosapic_intr_info; info < iosapic_intr_info + IA64_NUM_VECTORS; ++info)
+		if (info->gsi_base + info->rte_index == gsi)
+			return info - iosapic_intr_info;
+	return -1;
+}
+
+/*
+ * Translate GSI number to the corresponding IA-64 interrupt vector. If no
+ * entry exists, return -1.
+ */
+inline int
+gsi_to_vector (unsigned int gsi)
+{
+	return _gsi_to_vector(gsi);
+}
+
+int
+gsi_to_irq (unsigned int gsi)
+{
+	/*
+	 * XXX fix me: this assumes an identity mapping between IA-64 vector and Linux irq
+	 * numbers...
+	 */
+	return _gsi_to_vector(gsi);
+}
+
+static void
+set_rte (unsigned int vector, unsigned int dest, int mask)
+{
+	unsigned long pol, trigger, dmode;
+	u32 low32, high32;
+	char __iomem *addr;
+	int rte_index;
+	char redir;
+
+	DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest);
+
+	rte_index = iosapic_intr_info[vector].rte_index;
+	if (rte_index < 0)
+		return;		/* not an IOSAPIC interrupt */
+
+	addr    = iosapic_intr_info[vector].addr;
+	pol     = iosapic_intr_info[vector].polarity;
+	trigger = iosapic_intr_info[vector].trigger;
+	dmode   = iosapic_intr_info[vector].dmode;
+	vector &= (~IA64_IRQ_REDIRECTED);
+
+	redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0;
+
+#ifdef CONFIG_SMP
+	{
+		unsigned int irq;
+
+		for (irq = 0; irq < NR_IRQS; ++irq)
+			if (irq_to_vector(irq) == vector) {
+				set_irq_affinity_info(irq, (int)(dest & 0xffff), redir);
+				break;
+			}
+	}
+#endif
+
+	low32 = ((pol << IOSAPIC_POLARITY_SHIFT) |
+		 (trigger << IOSAPIC_TRIGGER_SHIFT) |
+		 (dmode << IOSAPIC_DELIVERY_SHIFT) |
+		 ((mask ? 1 : 0) << IOSAPIC_MASK_SHIFT) |
+		 vector);
+
+	/* dest contains both id and eid */
+	high32 = (dest << IOSAPIC_DEST_SHIFT);
+
+	iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32);
+	iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32);
+	iosapic_intr_info[vector].low32 = low32;
+}
+
+static void
+nop (unsigned int vector)
+{
+	/* do nothing...
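+	 * (nop is wired up below as the ->ack handler for level-triggered
+	 * interrupts and as the ->disable/->end handler for edge-triggered
+	 * ones, where nothing needs to be done)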
*/ +} + +static void +mask_irq (unsigned int irq) +{ + unsigned long flags; + char __iomem *addr; + u32 low32; + int rte_index; + ia64_vector vec = irq_to_vector(irq); + + addr = iosapic_intr_info[vec].addr; + rte_index = iosapic_intr_info[vec].rte_index; + + if (rte_index < 0) + return; /* not an IOSAPIC interrupt! */ + + spin_lock_irqsave(&iosapic_lock, flags); + { + /* set only the mask bit */ + low32 = iosapic_intr_info[vec].low32 |= IOSAPIC_MASK; + iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); + } + spin_unlock_irqrestore(&iosapic_lock, flags); +} + +static void +unmask_irq (unsigned int irq) +{ + unsigned long flags; + char __iomem *addr; + u32 low32; + int rte_index; + ia64_vector vec = irq_to_vector(irq); + + addr = iosapic_intr_info[vec].addr; + rte_index = iosapic_intr_info[vec].rte_index; + if (rte_index < 0) + return; /* not an IOSAPIC interrupt! */ + + spin_lock_irqsave(&iosapic_lock, flags); + { + low32 = iosapic_intr_info[vec].low32 &= ~IOSAPIC_MASK; + iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); + } + spin_unlock_irqrestore(&iosapic_lock, flags); +} + + +static void +iosapic_set_affinity (unsigned int irq, cpumask_t mask) +{ +#ifdef CONFIG_SMP + unsigned long flags; + u32 high32, low32; + int dest, rte_index; + char __iomem *addr; + int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0; + ia64_vector vec; + + irq &= (~IA64_IRQ_REDIRECTED); + vec = irq_to_vector(irq); + + if (cpus_empty(mask)) + return; + + dest = cpu_physical_id(first_cpu(mask)); + + rte_index = iosapic_intr_info[vec].rte_index; + addr = iosapic_intr_info[vec].addr; + + if (rte_index < 0) + return; /* not an IOSAPIC interrupt */ + + set_irq_affinity_info(irq, dest, redir); + + /* dest contains both id and eid */ + high32 = dest << IOSAPIC_DEST_SHIFT; + + spin_lock_irqsave(&iosapic_lock, flags); + { + low32 = iosapic_intr_info[vec].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT); + + if (redir) + /* change delivery mode to lowest priority */ + low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); + else + /* change delivery mode to fixed */ + low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT); + + iosapic_intr_info[vec].low32 = low32; + iosapic_write(addr, IOSAPIC_RTE_HIGH(rte_index), high32); + iosapic_write(addr, IOSAPIC_RTE_LOW(rte_index), low32); + } + spin_unlock_irqrestore(&iosapic_lock, flags); +#endif +} + +/* + * Handlers for level-triggered interrupts. + */ + +static unsigned int +iosapic_startup_level_irq (unsigned int irq) +{ + unmask_irq(irq); + return 0; +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + + move_irq(irq); + iosapic_eoi(iosapic_intr_info[vec].addr, vec); +} + +#define iosapic_shutdown_level_irq mask_irq +#define iosapic_enable_level_irq unmask_irq +#define iosapic_disable_level_irq mask_irq +#define iosapic_ack_level_irq nop + +struct hw_interrupt_type irq_type_iosapic_level = { + .typename = "IO-SAPIC-level", + .startup = iosapic_startup_level_irq, + .shutdown = iosapic_shutdown_level_irq, + .enable = iosapic_enable_level_irq, + .disable = iosapic_disable_level_irq, + .ack = iosapic_ack_level_irq, + .end = iosapic_end_level_irq, + .set_affinity = iosapic_set_affinity +}; + +/* + * Handlers for edge-triggered interrupts. + */ + +static unsigned int +iosapic_startup_edge_irq (unsigned int irq) +{ + unmask_irq(irq); + /* + * IOSAPIC simply drops interrupts pended while the + * corresponding pin was masked, so we can't know if an + * interrupt is pending already. Let's hope not... 
+ */ + return 0; +} + +static void +iosapic_ack_edge_irq (unsigned int irq) +{ + irq_desc_t *idesc = irq_descp(irq); + + move_irq(irq); + /* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ + if ((idesc->status & (IRQ_PENDING|IRQ_DISABLED)) == (IRQ_PENDING|IRQ_DISABLED)) + mask_irq(irq); +} + +#define iosapic_enable_edge_irq unmask_irq +#define iosapic_disable_edge_irq nop +#define iosapic_end_edge_irq nop + +struct hw_interrupt_type irq_type_iosapic_edge = { + .typename = "IO-SAPIC-edge", + .startup = iosapic_startup_edge_irq, + .shutdown = iosapic_disable_edge_irq, + .enable = iosapic_enable_edge_irq, + .disable = iosapic_disable_edge_irq, + .ack = iosapic_ack_edge_irq, + .end = iosapic_end_edge_irq, + .set_affinity = iosapic_set_affinity +}; + +unsigned int +iosapic_version (char __iomem *addr) +{ + /* + * IOSAPIC Version Register return 32 bit structure like: + * { + * unsigned int version : 8; + * unsigned int reserved1 : 8; + * unsigned int max_redir : 8; + * unsigned int reserved2 : 8; + * } + */ + return iosapic_read(addr, IOSAPIC_VERSION); +} + +/* + * if the given vector is already owned by other, + * assign a new vector for the other and make the vector available + */ +static void __init +iosapic_reassign_vector (int vector) +{ + int new_vector; + + if (iosapic_intr_info[vector].rte_index >= 0 || iosapic_intr_info[vector].addr + || iosapic_intr_info[vector].gsi_base || iosapic_intr_info[vector].dmode + || iosapic_intr_info[vector].polarity || iosapic_intr_info[vector].trigger) + { + new_vector = assign_irq_vector(AUTO_ASSIGN); + printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector); + memcpy(&iosapic_intr_info[new_vector], &iosapic_intr_info[vector], + sizeof(struct iosapic_intr_info)); + memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info)); + iosapic_intr_info[vector].rte_index = -1; + } +} + +static void +register_intr (unsigned int gsi, int vector, unsigned char delivery, + unsigned long polarity, unsigned long trigger) +{ + irq_desc_t *idesc; + struct hw_interrupt_type *irq_type; + int rte_index; + int index; + unsigned long gsi_base; + void __iomem *iosapic_address; + + index = find_iosapic(gsi); + if (index < 0) { + printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", __FUNCTION__, gsi); + return; + } + + iosapic_address = iosapic_lists[index].addr; + gsi_base = iosapic_lists[index].gsi_base; + + rte_index = gsi - gsi_base; + iosapic_intr_info[vector].rte_index = rte_index; + iosapic_intr_info[vector].polarity = polarity; + iosapic_intr_info[vector].dmode = delivery; + iosapic_intr_info[vector].addr = iosapic_address; + iosapic_intr_info[vector].gsi_base = gsi_base; + iosapic_intr_info[vector].trigger = trigger; + iosapic_intr_info[vector].refcnt++; + + if (trigger == IOSAPIC_EDGE) + irq_type = &irq_type_iosapic_edge; + else + irq_type = &irq_type_iosapic_level; + + idesc = irq_descp(vector); + if (idesc->handler != irq_type) { + if (idesc->handler != &no_irq_type) + printk(KERN_WARNING "%s: changing vector %d from %s to %s\n", + __FUNCTION__, vector, idesc->handler->typename, irq_type->typename); + idesc->handler = irq_type; + } +} + +static unsigned int +get_target_cpu (unsigned int gsi, int vector) +{ +#ifdef CONFIG_SMP + static int cpu = -1; + + /* + * If the platform supports redirection via XTP, let it + * distribute interrupts. 
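+	 * (With SMP_IRQ_REDIRECTION the RTE is programmed for lowest-priority
+	 * delivery and each CPU hints its own priority via its XTP register,
+	 * so the destination id returned here matters little.)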
+	 */
+	if (smp_int_redirect & SMP_IRQ_REDIRECTION)
+		return cpu_physical_id(smp_processor_id());
+
+	/*
+	 * Some interrupts (ACPI SCI, for instance) are registered
+	 * before the BSP is marked as online.
+	 */
+	if (!cpu_online(smp_processor_id()))
+		return cpu_physical_id(smp_processor_id());
+
+#ifdef CONFIG_NUMA
+	{
+		int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
+		cpumask_t cpu_mask;
+
+		iosapic_index = find_iosapic(gsi);
+		if (iosapic_index < 0 ||
+		    iosapic_lists[iosapic_index].node == MAX_NUMNODES)
+			goto skip_numa_setup;
+
+		cpu_mask = node_to_cpumask(iosapic_lists[iosapic_index].node);
+
+		for_each_cpu_mask(numa_cpu, cpu_mask) {
+			if (!cpu_online(numa_cpu))
+				cpu_clear(numa_cpu, cpu_mask);
+		}
+
+		num_cpus = cpus_weight(cpu_mask);
+
+		if (!num_cpus)
+			goto skip_numa_setup;
+
+		/* Use vector assignment to distribute across cpus in node */
+		cpu_index = vector % num_cpus;
+
+		for (numa_cpu = first_cpu(cpu_mask) ; i < cpu_index ; i++)
+			numa_cpu = next_cpu(numa_cpu, cpu_mask);
+
+		if (numa_cpu != NR_CPUS)
+			return cpu_physical_id(numa_cpu);
+	}
+skip_numa_setup:
+#endif
+	/*
+	 * Otherwise, round-robin interrupt vectors across all the
+	 * processors. (It'd be nice if we could be smarter in the
+	 * case of NUMA.)
+	 */
+	do {
+		if (++cpu >= NR_CPUS)
+			cpu = 0;
+	} while (!cpu_online(cpu));
+
+	return cpu_physical_id(cpu);
+#else
+	return cpu_physical_id(smp_processor_id());
+#endif
+}
+
+/*
+ * ACPI can describe IOSAPIC interrupts via static tables and namespace
+ * methods. This provides an interface to register those interrupts and
+ * program the IOSAPIC RTE.
+ */
+int
+iosapic_register_intr (unsigned int gsi,
+		       unsigned long polarity, unsigned long trigger)
+{
+	int vector;
+	unsigned int dest;
+	unsigned long flags;
+
+	/*
+	 * If this GSI has already been registered (i.e., it's a
+	 * shared interrupt, or we lost a race to register it),
+	 * don't touch the RTE.
+	 */
+	spin_lock_irqsave(&iosapic_lock, flags);
+	{
+		vector = gsi_to_vector(gsi);
+		if (vector > 0) {
+			iosapic_intr_info[vector].refcnt++;
+			spin_unlock_irqrestore(&iosapic_lock, flags);
+			return vector;
+		}
+
+		vector = assign_irq_vector(AUTO_ASSIGN);
+		dest = get_target_cpu(gsi, vector);
+		register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY,
+			      polarity, trigger);
+
+		set_rte(vector, dest, 1);
+	}
+	spin_unlock_irqrestore(&iosapic_lock, flags);
+
+	printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
+	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
+	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
+	       cpu_logical_id(dest), dest, vector);
+
+	return vector;
+}
+
+#ifdef CONFIG_ACPI_DEALLOCATE_IRQ
+void
+iosapic_unregister_intr (unsigned int gsi)
+{
+	unsigned long flags;
+	int irq, vector;
+	irq_desc_t *idesc;
+	int rte_index;
+	unsigned long trigger, polarity;
+
+	/*
+	 * If the irq associated with the gsi is not found,
+	 * iosapic_unregister_intr() is unbalanced. We need to check
+	 * this again after getting locks.
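+	 * (In other words, every successful iosapic_register_intr() for a GSI
+	 * must be balanced by exactly one iosapic_unregister_intr(); the
+	 * refcnt manipulated below is what tolerates sharing in between.)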
+	 */
+	irq = gsi_to_irq(gsi);
+	if (irq < 0) {
+		printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi);
+		WARN_ON(1);
+		return;
+	}
+	vector = irq_to_vector(irq);
+
+	idesc = irq_descp(irq);
+	spin_lock_irqsave(&idesc->lock, flags);
+	spin_lock(&iosapic_lock);
+	{
+		rte_index = iosapic_intr_info[vector].rte_index;
+		if (rte_index < 0) {
+			spin_unlock(&iosapic_lock);
+			spin_unlock_irqrestore(&idesc->lock, flags);
+			printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", gsi);
+			WARN_ON(1);
+			return;
+		}
+
+		if (--iosapic_intr_info[vector].refcnt > 0) {
+			spin_unlock(&iosapic_lock);
+			spin_unlock_irqrestore(&idesc->lock, flags);
+			return;
+		}
+
+		/*
+		 * If interrupt handlers still exist on the irq
+		 * associated with the gsi, don't unregister the
+		 * interrupt.
+		 */
+		if (idesc->action) {
+			iosapic_intr_info[vector].refcnt++;
+			spin_unlock(&iosapic_lock);
+			spin_unlock_irqrestore(&idesc->lock, flags);
+			printk(KERN_WARNING "Cannot unregister GSI. IRQ %u is still in use.\n", irq);
+			return;
+		}
+
+		/* Clear the interrupt controller descriptor. */
+		idesc->handler = &no_irq_type;
+
+		trigger = iosapic_intr_info[vector].trigger;
+		polarity = iosapic_intr_info[vector].polarity;
+
+		/* Clear the interrupt information. */
+		memset(&iosapic_intr_info[vector], 0, sizeof(struct iosapic_intr_info));
+		iosapic_intr_info[vector].rte_index = -1;	/* mark as unused */
+	}
+	spin_unlock(&iosapic_lock);
+	spin_unlock_irqrestore(&idesc->lock, flags);
+
+	/* Free the interrupt vector */
+	free_irq_vector(vector);
+
+	printk(KERN_INFO "GSI %u (%s, %s) -> vector %d unregistered.\n",
+	       gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
+	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
+	       vector);
+}
+#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */
+
+/*
+ * ACPI calls this when it finds an entry for a platform interrupt.
+ * Note that the gsi_base and IOSAPIC address must be set in iosapic_init().
+ */
+int __init
+iosapic_register_platform_intr (u32 int_type, unsigned int gsi,
+				int iosapic_vector, u16 eid, u16 id,
+				unsigned long polarity, unsigned long trigger)
+{
+	static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"};
+	unsigned char delivery;
+	int vector, mask = 0;
+	unsigned int dest = ((id << 8) | eid) & 0xffff;
+
+	switch (int_type) {
+	case ACPI_INTERRUPT_PMI:
+		vector = iosapic_vector;
+		/*
+		 * since the PMI vector is allocated by FW (ACPI), not by the kernel,
+		 * we need to make sure the vector is available
+		 */
+		iosapic_reassign_vector(vector);
+		delivery = IOSAPIC_PMI;
+		break;
+	case ACPI_INTERRUPT_INIT:
+		vector = assign_irq_vector(AUTO_ASSIGN);
+		delivery = IOSAPIC_INIT;
+		break;
+	case ACPI_INTERRUPT_CPEI:
+		vector = IA64_CPE_VECTOR;
+		delivery = IOSAPIC_LOWEST_PRIORITY;
+		mask = 1;
+		break;
+	default:
+		printk(KERN_ERR "iosapic_register_platform_irq(): invalid int type 0x%x\n", int_type);
+		return -1;
+	}
+
+	register_intr(gsi, vector, delivery, polarity, trigger);
+
+	printk(KERN_INFO "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n",
+	       int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown",
+	       int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"),
+	       (polarity == IOSAPIC_POL_HIGH ? "high" : "low"),
+	       cpu_logical_id(dest), dest, vector);
+
+	set_rte(vector, dest, mask);
+	return vector;
+}
+
+
+/*
+ * ACPI calls this when it finds an entry for a legacy ISA IRQ override.
+ * Note that the gsi_base and IOSAPIC address must be set in iosapic_init().
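+ * A typical (illustrative) example is an ACPI Interrupt Source Override
+ * that routes ISA IRQ 0, the PIT timer, to GSI 2 instead of the
+ * identity-mapped GSI 0.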
+ */
+void __init
+iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi,
+			  unsigned long polarity,
+			  unsigned long trigger)
+{
+	int vector;
+	unsigned int dest = cpu_physical_id(smp_processor_id());
+
+	vector = isa_irq_to_vector(isa_irq);
+
+	register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, polarity, trigger);
+
+	DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n",
+	    isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level",
+	    polarity == IOSAPIC_POL_HIGH ? "high" : "low",
+	    cpu_logical_id(dest), dest, vector);
+
+	set_rte(vector, dest, 1);
+}
+
+void __init
+iosapic_system_init (int system_pcat_compat)
+{
+	int vector;
+
+	for (vector = 0; vector < IA64_NUM_VECTORS; ++vector)
+		iosapic_intr_info[vector].rte_index = -1;	/* mark as unused */
+
+	pcat_compat = system_pcat_compat;
+	if (pcat_compat) {
+		/*
+		 * Disable the compatibility mode interrupts (8259 style); this needs
+		 * IN/OUT support enabled.
+		 */
+		printk(KERN_INFO "%s: Disabling PC-AT compatible 8259 interrupts\n", __FUNCTION__);
+		outb(0xff, 0xA1);
+		outb(0xff, 0x21);
+	}
+}
+
+void __init
+iosapic_init (unsigned long phys_addr, unsigned int gsi_base)
+{
+	int num_rte;
+	unsigned int isa_irq, ver;
+	char __iomem *addr;
+
+	addr = ioremap(phys_addr, 0);
+	ver = iosapic_version(addr);
+
+	/*
+	 * The MAX_REDIR register holds the highest input pin
+	 * number (starting from 0).
+	 * We add 1 so that we can use it as the number of pins (= RTEs).
+	 */
+	num_rte = ((ver >> 16) & 0xff) + 1;
+
+	iosapic_lists[num_iosapic].addr = addr;
+	iosapic_lists[num_iosapic].gsi_base = gsi_base;
+	iosapic_lists[num_iosapic].num_rte = num_rte;
+#ifdef CONFIG_NUMA
+	iosapic_lists[num_iosapic].node = MAX_NUMNODES;
+#endif
+	num_iosapic++;
+
+	if ((gsi_base == 0) && pcat_compat) {
+		/*
+		 * Map the legacy ISA devices into the IOSAPIC data. Some of these may
+		 * get reprogrammed later on with data from the ACPI Interrupt Source
+		 * Override table.
+		 */
+		for (isa_irq = 0; isa_irq < 16; ++isa_irq)
+			iosapic_override_isa_irq(isa_irq, isa_irq, IOSAPIC_POL_HIGH, IOSAPIC_EDGE);
+	}
+}
+
+#ifdef CONFIG_NUMA
+void __init
+map_iosapic_to_node(unsigned int gsi_base, int node)
+{
+	int index;
+
+	index = find_iosapic(gsi_base);
+	if (index < 0) {
+		printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n",
+		       __FUNCTION__, gsi_base);
+		return;
+	}
+	iosapic_lists[index].node = node;
+	return;
+}
+#endif
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
new file mode 100644
index 000000000000..28f2aadc38d0
--- /dev/null
+++ b/arch/ia64/kernel/irq.c
@@ -0,0 +1,238 @@
+/*
+ * linux/arch/ia64/kernel/irq.c
+ *
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the code used by various IRQ handling routines:
+ * asking for different IRQs should be done through these routines
+ * instead of just grabbing them. Thus setups with different IRQ numbers
+ * shouldn't result in any weird surprises, and installing new handlers
+ * should be easier.
+ *
+ * Copyright (C) Ashok Raj <ashok.raj@intel.com>, Intel Corporation 2004
+ *
+ * 4/14/2004: Added code to handle cpu migration and do safe irq
+ *	      migration without losing interrupts for iosapic
+ *	      architecture.
+ */
+
+#include <asm/delay.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+/*
+ * 'What should we do if we get a hw irq event on an illegal vector?'
+ * Each architecture has to answer this itself.
+ */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "Unexpected irq vector 0x%x on CPU %u!\n", irq, smp_processor_id()); +} + +#ifdef CONFIG_IA64_GENERIC +unsigned int __ia64_local_vector_to_irq (ia64_vector vec) +{ + return (unsigned int) vec; +} +#endif + +/* + * Interrupt statistics: + */ + +atomic_t irq_err_count; + +/* + * /proc/interrupts printing: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for (j=0; j<NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "CPU%d ",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + return 0; +} + +#ifdef CONFIG_SMP +/* + * This is updated when the user sets irq affinity via /proc + */ +static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; +static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)]; + +static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; + +/* + * Arch specific routine for deferred write to iosapic rte to reprogram + * intr destination. + */ +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + pending_irq_cpumask[irq] = mask_val; +} + +void set_irq_affinity_info (unsigned int irq, int hwid, int redir) +{ + cpumask_t mask = CPU_MASK_NONE; + + cpu_set(cpu_logical_id(hwid), mask); + + if (irq < NR_IRQS) { + irq_affinity[irq] = mask; + irq_redir[irq] = (char) (redir & 0xff); + } +} + + +void move_irq(int irq) +{ + /* note - we hold desc->lock */ + cpumask_t tmp; + irq_desc_t *desc = irq_descp(irq); + int redir = test_bit(irq, pending_irq_redir); + + if (unlikely(!desc->handler->set_affinity)) + return; + + if (!cpus_empty(pending_irq_cpumask[irq])) { + cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); + if (unlikely(!cpus_empty(tmp))) { + desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0), + pending_irq_cpumask[irq]); + } + cpus_clear(pending_irq_cpumask[irq]); + } +} + + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_HOTPLUG_CPU +unsigned int vectors_in_migration[NR_IRQS]; + +/* + * Since cpu_online_map is already updated, we just need to check for + * affinity that has zeros + */ +static void migrate_irqs(void) +{ + cpumask_t mask; + irq_desc_t *desc; + int irq, new_cpu; + + for (irq=0; irq < NR_IRQS; irq++) { + desc = irq_descp(irq); + + /* + * No handling for now. + * TBD: Implement a disable function so we can now + * tell CPU not to respond to these local intr sources. + * such as ITV,CPEI,MCA etc. + */ + if (desc->status == IRQ_PER_CPU) + continue; + + cpus_and(mask, irq_affinity[irq], cpu_online_map); + if (any_online_cpu(mask) == NR_CPUS) { + /* + * Save it for phase 2 processing + */ + vectors_in_migration[irq] = irq; + + new_cpu = any_online_cpu(cpu_online_map); + mask = cpumask_of_cpu(new_cpu); + + /* + * Al three are essential, currently WARN_ON.. 
maybe panic?
+ */
+ if (desc->handler && desc->handler->disable &&
+ desc->handler->enable && desc->handler->set_affinity) {
+ desc->handler->disable(irq);
+ desc->handler->set_affinity(irq, mask);
+ desc->handler->enable(irq);
+ } else {
+ WARN_ON((!(desc->handler) || !(desc->handler->disable) ||
+ !(desc->handler->enable) ||
+ !(desc->handler->set_affinity)));
+ }
+ }
+ }
+}
+
+void fixup_irqs(void)
+{
+ unsigned int irq;
+ extern void ia64_process_pending_intr(void);
+
+ ia64_set_itv(1<<16);
+ /*
+ * Phase 1: Locate irqs bound to this cpu and
+ * relocate them for cpu removal.
+ */
+ migrate_irqs();
+
+ /*
+ * Phase 2: Perform interrupt processing for all entries reported in
+ * local APIC.
+ */
+ ia64_process_pending_intr();
+
+ /*
+ * Phase 3: Now handle any interrupts not captured in local APIC.
+ * This accounts for cases where a device interrupted while its RTE
+ * was being disabled and re-programmed.
+ */
+ for (irq=0; irq < NR_IRQS; irq++) {
+ if (vectors_in_migration[irq]) {
+ vectors_in_migration[irq]=0;
+ __do_IRQ(irq, NULL);
+ }
+ }
+
+ /*
+ * Now let the processor die. We disable irqs and use max_xtp() to
+ * ensure that no more interrupts are routed to this processor.
+ * The local timer interrupt may still have one interrupt pending;
+ * that case is taken care of in timer_interrupt().
+ */
+ max_xtp();
+ local_irq_disable();
+}
+#endif diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c new file mode 100644 index 000000000000..5ba06ebe355b --- /dev/null +++ b/arch/ia64/kernel/irq_ia64.c @@ -0,0 +1,278 @@ +/*
+ * linux/arch/ia64/kernel/irq_ia64.c
+ *
+ * Copyright (C) 1998-2001 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 6/10/99: Updated to bring in sync with x86 version to facilitate
+ * support for SMP and different interrupt controllers.
+ *
+ * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector
+ * PCI to vector allocation routine.
+ * 04/14/2004 Ashok Raj <ashok.raj@intel.com>
+ * Added CPU Hotplug handling for IPF.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/jiffies.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/kernel_stat.h>
+#include <linux/slab.h>
+#include <linux/ptrace.h>
+#include <linux/random.h> /* for rand_initialize_irq() */
+#include <linux/signal.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/threads.h>
+#include <linux/bitops.h>
+
+#include <asm/delay.h>
+#include <asm/intrinsics.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/machvec.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+
+#ifdef CONFIG_PERFMON
+# include <asm/perfmon.h>
+#endif
+
+#define IRQ_DEBUG 0
+
+/* default base addr of IPI table */
+void __iomem *ipi_base_addr = ((void __iomem *)
+ (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR));
+
+/*
+ * Legacy IRQ to IA-64 vector translation table.
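+ *
+ * Note the mapping is not order-preserving: ISA IRQ 0 (the PC timer)
+ * maps to vector 0x2f while IRQ 1 maps to 0x20. A sketch of the lookup
+ * this table backs (assuming isa_irq_to_vector() is the trivial array
+ * wrapper declared in the headers, with bounds checked by the caller):
+ *
+ * vector = isa_irq_to_vector_map[isa_irq];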
+ */ +__u8 isa_irq_to_vector_map[16] = { + /* 8259 IRQ translation, first 16 entries */ + 0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, + 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21 +}; +EXPORT_SYMBOL(isa_irq_to_vector_map); + +static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)]; + +int +assign_irq_vector (int irq) +{ + int pos, vector; + again: + pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS); + vector = IA64_FIRST_DEVICE_VECTOR + pos; + if (vector > IA64_LAST_DEVICE_VECTOR) + /* XXX could look for sharable vectors instead of panic'ing... */ + panic("assign_irq_vector: out of interrupt vectors!"); + if (test_and_set_bit(pos, ia64_vector_mask)) + goto again; + return vector; +} + +void +free_irq_vector (int vector) +{ + int pos; + + if (vector < IA64_FIRST_DEVICE_VECTOR || vector > IA64_LAST_DEVICE_VECTOR) + return; + + pos = vector - IA64_FIRST_DEVICE_VECTOR; + if (!test_and_clear_bit(pos, ia64_vector_mask)) + printk(KERN_WARNING "%s: double free!\n", __FUNCTION__); +} + +#ifdef CONFIG_SMP +# define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) +#else +# define IS_RESCHEDULE(vec) (0) +#endif +/* + * That's where the IVT branches when we get an external + * interrupt. This branches to the correct hardware IRQ handler via + * function ptr. + */ +void +ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) +{ + unsigned long saved_tpr; + +#if IRQ_DEBUG + { + unsigned long bsp, sp; + + /* + * Note: if the interrupt happened while executing in + * the context switch routine (ia64_switch_to), we may + * get a spurious stack overflow here. This is + * because the register and the memory stack are not + * switched atomically. + */ + bsp = ia64_getreg(_IA64_REG_AR_BSP); + sp = ia64_getreg(_IA64_REG_SP); + + if ((sp - bsp) < 1024) { + static unsigned char count; + static long last_time; + + if (jiffies - last_time > 5*HZ) + count = 0; + if (++count < 5) { + last_time = jiffies; + printk("ia64_handle_irq: DANGER: less than " + "1KB of free stack space!!\n" + "(bsp=0x%lx, sp=%lx)\n", bsp, sp); + } + } + } +#endif /* IRQ_DEBUG */ + + /* + * Always set TPR to limit maximum interrupt nesting depth to + * 16 (without this, it would be ~240, which could easily lead + * to kernel stack overflows). + */ + irq_enter(); + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); + while (vector != IA64_SPURIOUS_INT_VECTOR) { + if (!IS_RESCHEDULE(vector)) { + ia64_setreg(_IA64_REG_CR_TPR, vector); + ia64_srlz_d(); + + __do_IRQ(local_vector_to_irq(vector), regs); + + /* + * Disable interrupts and send EOI: + */ + local_irq_disable(); + ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); + } + ia64_eoi(); + vector = ia64_get_ivr(); + } + /* + * This must be done *after* the ia64_eoi(). For example, the keyboard softirq + * handler needs to be able to wait for further keyboard interrupts, which can't + * come through until ia64_eoi() has been done. + */ + irq_exit(); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * This function emulates a interrupt processing when a cpu is about to be + * brought down. 
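+ *
+ * Conceptually it drains every vector still latched in the local SAPIC,
+ * with the same loop shape as ia64_handle_irq. A simplified C sketch of
+ * the code below (TPR handling and the IS_RESCHEDULE check elided):
+ *
+ * while ((vec = ia64_get_ivr()) != IA64_SPURIOUS_INT_VECTOR) {
+ * __do_IRQ(local_vector_to_irq(vec), NULL);
+ * ia64_eoi();
+ * }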
+ */
+void ia64_process_pending_intr(void)
+{
+ ia64_vector vector;
+ unsigned long saved_tpr;
+ extern unsigned int vectors_in_migration[NR_IRQS];
+
+ vector = ia64_get_ivr();
+
+ irq_enter();
+ saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
+ ia64_srlz_d();
+
+ /*
+ * Perform normal interrupt style processing
+ */
+ while (vector != IA64_SPURIOUS_INT_VECTOR) {
+ if (!IS_RESCHEDULE(vector)) {
+ ia64_setreg(_IA64_REG_CR_TPR, vector);
+ ia64_srlz_d();
+
+ /*
+ * Now perform the same processing that ia64_handle_irq would
+ * have done had this arrived as a real interrupt, passing
+ * NULL for pt_regs. The two paths could probably share code.
+ */
+ vectors_in_migration[local_vector_to_irq(vector)]=0;
+ __do_IRQ(local_vector_to_irq(vector), NULL);
+
+ /*
+ * Disable interrupts and send EOI
+ */
+ local_irq_disable();
+ ia64_setreg(_IA64_REG_CR_TPR, saved_tpr);
+ }
+ ia64_eoi();
+ vector = ia64_get_ivr();
+ }
+ irq_exit();
+}
+#endif
+
+
+#ifdef CONFIG_SMP
+extern irqreturn_t handle_IPI (int irq, void *dev_id, struct pt_regs *regs);
+
+static struct irqaction ipi_irqaction = {
+ .handler = handle_IPI,
+ .flags = SA_INTERRUPT,
+ .name = "IPI"
+};
+#endif
+
+void
+register_percpu_irq (ia64_vector vec, struct irqaction *action)
+{
+ irq_desc_t *desc;
+ unsigned int irq;
+
+ for (irq = 0; irq < NR_IRQS; ++irq)
+ if (irq_to_vector(irq) == vec) {
+ desc = irq_descp(irq);
+ desc->status |= IRQ_PER_CPU;
+ desc->handler = &irq_type_ia64_lsapic;
+ if (action)
+ setup_irq(irq, action);
+ }
+}
+
+void __init
+init_IRQ (void)
+{
+ register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL);
+#ifdef CONFIG_SMP
+ register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
+#endif
+#ifdef CONFIG_PERFMON
+ pfm_init_percpu();
+#endif
+ platform_irq_init();
+}
+
+void
+ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect)
+{
+ void __iomem *ipi_addr;
+ unsigned long ipi_data;
+ unsigned long phys_cpu_id;
+
+#ifdef CONFIG_SMP
+ phys_cpu_id = cpu_physical_id(cpu);
+#else
+ phys_cpu_id = (ia64_getreg(_IA64_REG_CR_LID) >> 16) & 0xffff;
+#endif
+
+ /*
+ * The cpu number is encoded as an 8-bit ID and an 8-bit EID.
+ */
+
+ ipi_data = (delivery_mode << 8) | (vector & 0xff);
+ ipi_addr = ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3));
+
+ writeq(ipi_data, ipi_addr);
+} diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c new file mode 100644 index 000000000000..ea14e6a04409 --- /dev/null +++ b/arch/ia64/kernel/irq_lsapic.c @@ -0,0 +1,37 @@ +/*
+ * LSAPIC Interrupt Controller
+ *
+ * This takes care of interrupts that are generated by the CPU's
+ * internal Streamlined Advanced Programmable Interrupt Controller
+ * (LSAPIC), such as the ITC and IPI interrupts.
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 2000 Hewlett-Packard Co
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/irq.h>
+
+static unsigned int
+lsapic_noop_startup (unsigned int irq)
+{
+ return 0;
+}
+
+static void
+lsapic_noop (unsigned int irq)
+{
+ /* nothing to do...
*/ +} + +struct hw_interrupt_type irq_type_ia64_lsapic = { + .typename = "LSAPIC", + .startup = lsapic_noop_startup, + .shutdown = lsapic_noop, + .enable = lsapic_noop, + .disable = lsapic_noop, + .ack = lsapic_noop, + .end = lsapic_noop +}; diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S new file mode 100644 index 000000000000..d9c05d53435b --- /dev/null +++ b/arch/ia64/kernel/ivt.S @@ -0,0 +1,1619 @@ +/* + * arch/ia64/kernel/ivt.S + * + * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger <davidm@hpl.hp.com> + * Copyright (C) 2000, 2002-2003 Intel Co + * Asit Mallick <asit.k.mallick@intel.com> + * Suresh Siddha <suresh.b.siddha@intel.com> + * Kenneth Chen <kenneth.w.chen@intel.com> + * Fenghua Yu <fenghua.yu@intel.com> + * + * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP + * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT. + */ +/* + * This file defines the interruption vector table used by the CPU. + * It does not include one entry per possible cause of interruption. + * + * The first 20 entries of the table contain 64 bundles each while the + * remaining 48 entries contain only 16 bundles each. + * + * The 64 bundles are used to allow inlining the whole handler for critical + * interruptions like TLB misses. + * + * For each entry, the comment is as follows: + * + * // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) + * entry offset ----/ / / / / + * entry number ---------/ / / / + * size of the entry -------------/ / / + * vector name -------------------------------------/ / + * interruptions triggering this vector ----------------------/ + * + * The table is 32KB in size and must be aligned on 32KB boundary. + * (The CPU ignores the 15 lower bits of the address) + * + * Table is based upon EAS2.6 (Oct 1999) + */ + +#include <linux/config.h> + +#include <asm/asmmacro.h> +#include <asm/break.h> +#include <asm/ia32.h> +#include <asm/kregs.h> +#include <asm/offsets.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/system.h> +#include <asm/thread_info.h> +#include <asm/unistd.h> +#include <asm/errno.h> + +#if 1 +# define PSR_DEFAULT_BITS psr.ac +#else +# define PSR_DEFAULT_BITS 0 +#endif + +#if 0 + /* + * This lets you track the last eight faults that occurred on the CPU. Make sure ar.k2 isn't + * needed for something else before enabling this... + */ +# define DBG_FAULT(i) mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16 +#else +# define DBG_FAULT(i) +#endif + +#define MINSTATE_VIRT /* needed by minstate.h */ +#include "minstate.h" + +#define FAULT(n) \ + mov r31=pr; \ + mov r19=n;; /* prepare to save predicates */ \ + br.sptk.many dispatch_to_fault_handler + + .section .text.ivt,"ax" + + .align 32768 // align on 32KB boundary + .global ia64_ivt +ia64_ivt: +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47) +ENTRY(vhpt_miss) + DBG_FAULT(0) + /* + * The VHPT vector is invoked when the TLB entry for the virtual page table + * is missing. This happens only as a result of a previous + * (the "original") TLB miss, which may either be caused by an instruction + * fetch or a data access (or non-access). 
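+ *
+ * In C terms, the walk performed below amounts to roughly (an
+ * illustrative sketch only; the real code extracts the indices with
+ * shifts and deposits on the faulting address, in physical mode):
+ *
+ * pgd = in_region_5 ? &swapper_pg_dir[pgd_idx] : &region_base[pgd_idx];
+ * pmd = pmd_offset(pgd, ifa); // the "L2" lookup
+ * pte = pte_offset(pmd, ifa); // the "L3" lookup
+ * if (pte_present(*pte)) insert it with itc; else goto page_fault;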
+ * + * What we do here is normal TLB miss handing for the _original_ miss, followed + * by inserting the TLB entry for the virtual page table page that the VHPT + * walker was attempting to access. The latter gets inserted as long + * as both L1 and L2 have valid mappings for the faulting address. + * The TLB entry for the original miss gets inserted only if + * the L3 entry indicates that the page is present. + * + * do_page_fault gets invoked in the following cases: + * - the faulting virtual address uses unimplemented address bits + * - the faulting virtual address has no L1, L2, or L3 mapping + */ + mov r16=cr.ifa // get address that caused the TLB miss +#ifdef CONFIG_HUGETLB_PAGE + movl r18=PAGE_SHIFT + mov r25=cr.itir +#endif + ;; + rsm psr.dt // use physical addressing for data + mov r31=pr // save the predicate registers + mov r19=IA64_KR(PT_BASE) // get page table base address + shl r21=r16,3 // shift bit 60 into sign bit + shr.u r17=r16,61 // get the region number into r17 + ;; + shr r22=r21,3 +#ifdef CONFIG_HUGETLB_PAGE + extr.u r26=r25,2,6 + ;; + cmp.ne p8,p0=r18,r26 + sub r27=r26,r18 + ;; +(p8) dep r25=r18,r25,2,6 +(p8) shr r22=r22,r27 +#endif + ;; + cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5? + shr.u r18=r22,PGDIR_SHIFT // get bits 33-63 of the faulting address + ;; +(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place + + srlz.d + LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir + + .pred.rel "mutex", p6, p7 +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 + ;; +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) + cmp.eq p7,p6=0,r21 // unused address bits all zeroes? + shr.u r18=r22,PMD_SHIFT // shift L2 index into position + ;; + ld8 r17=[r17] // fetch the L1 entry (may be 0) + ;; +(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry + ;; +(p7) ld8 r20=[r17] // fetch the L2 entry (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift L3 index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was L2 entry NULL? + dep r21=r19,r20,3,(PAGE_SHIFT-3) // compute address of L3 page table entry + ;; +(p7) ld8 r18=[r21] // read the L3 PTE + mov r19=cr.isr // cr.isr bit 0 tells us if this is an insn miss + ;; +(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared? + mov r22=cr.iha // get the VHPT address that caused the TLB miss + ;; // avoid RAW on p7 +(p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss? + dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address + ;; +(p10) itc.i r18 // insert the instruction TLB entry +(p11) itc.d r18 // insert the data TLB entry +(p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault) + mov cr.ifa=r22 + +#ifdef CONFIG_HUGETLB_PAGE +(p8) mov cr.itir=r25 // change to default page-size for VHPT +#endif + + /* + * Now compute and insert the TLB entry for the virtual page table. We never + * execute in a page table page so there is no need to set the exception deferral + * bit. + */ + adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23 + ;; +(p7) itc.d r24 + ;; +#ifdef CONFIG_SMP + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + /* + * Re-check L2 and L3 pagetable. 
If they changed, we may have received a ptc.g + * between reading the pagetable and the "itc". If so, flush the entry we + * inserted and retry. + */ + ld8 r25=[r21] // read L3 PTE again + ld8 r26=[r17] // read L2 entry again + ;; + cmp.ne p6,p7=r26,r20 // did L2 entry change + mov r27=PAGE_SHIFT<<2 + ;; +(p6) ptc.l r22,r27 // purge PTE page translation +(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did L3 PTE change + ;; +(p6) ptc.l r16,r27 // purge translation +#endif + + mov pr=r31,-1 // restore predicate registers + rfi +END(vhpt_miss) + + .org ia64_ivt+0x400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0400 Entry 1 (size 64 bundles) ITLB (21) +ENTRY(itlb_miss) + DBG_FAULT(1) + /* + * The ITLB handler accesses the L3 PTE via the virtually mapped linear + * page table. If a nested TLB miss occurs, we switch into physical + * mode, walk the page table, and then re-execute the L3 PTE read + * and go on normally after that. + */ + mov r16=cr.ifa // get virtual address + mov r29=b0 // save b0 + mov r31=pr // save predicates +.itlb_fault: + mov r17=cr.iha // get virtual address of L3 PTE + movl r30=1f // load nested fault continuation point + ;; +1: ld8 r18=[r17] // read L3 PTE + ;; + mov b0=r29 + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? +(p6) br.cond.spnt page_fault + ;; + itc.i r18 + ;; +#ifdef CONFIG_SMP + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + ld8 r19=[r17] // read L3 PTE again and see if same + mov r20=PAGE_SHIFT<<2 // setup page size for purge + ;; + cmp.ne p7,p0=r18,r19 + ;; +(p7) ptc.l r16,r20 +#endif + mov pr=r31,-1 + rfi +END(itlb_miss) + + .org ia64_ivt+0x0800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48) +ENTRY(dtlb_miss) + DBG_FAULT(2) + /* + * The DTLB handler accesses the L3 PTE via the virtually mapped linear + * page table. If a nested TLB miss occurs, we switch into physical + * mode, walk the page table, and then re-execute the L3 PTE read + * and go on normally after that. + */ + mov r16=cr.ifa // get virtual address + mov r29=b0 // save b0 + mov r31=pr // save predicates +dtlb_fault: + mov r17=cr.iha // get virtual address of L3 PTE + movl r30=1f // load nested fault continuation point + ;; +1: ld8 r18=[r17] // read L3 PTE + ;; + mov b0=r29 + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? 
+(p6) br.cond.spnt page_fault + ;; + itc.d r18 + ;; +#ifdef CONFIG_SMP + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + ld8 r19=[r17] // read L3 PTE again and see if same + mov r20=PAGE_SHIFT<<2 // setup page size for purge + ;; + cmp.ne p7,p0=r18,r19 + ;; +(p7) ptc.l r16,r20 +#endif + mov pr=r31,-1 + rfi +END(dtlb_miss) + + .org ia64_ivt+0x0c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19) +ENTRY(alt_itlb_miss) + DBG_FAULT(3) + mov r16=cr.ifa // get address that caused the TLB miss + movl r17=PAGE_KERNEL + mov r21=cr.ipsr + movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) + mov r31=pr + ;; +#ifdef CONFIG_DISABLE_VHPT + shr.u r22=r16,61 // get the region number into r21 + ;; + cmp.gt p8,p0=6,r22 // user mode + ;; +(p8) thash r17=r16 + ;; +(p8) mov cr.iha=r17 +(p8) mov r29=b0 // save b0 +(p8) br.cond.dptk .itlb_fault +#endif + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl + and r19=r19,r16 // clear ed, reserved bits, and PTE control bits + shr.u r18=r16,57 // move address bit 61 to bit 4 + ;; + andcm r18=0x10,r18 // bit 4=~address-bit(61) + cmp.ne p8,p0=r0,r23 // psr.cpl != 0? + or r19=r17,r19 // insert PTE control bits into r19 + ;; + or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 +(p8) br.cond.spnt page_fault + ;; + itc.i r19 // insert the TLB entry + mov pr=r31,-1 + rfi +END(alt_itlb_miss) + + .org ia64_ivt+0x1000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46) +ENTRY(alt_dtlb_miss) + DBG_FAULT(4) + mov r16=cr.ifa // get address that caused the TLB miss + movl r17=PAGE_KERNEL + mov r20=cr.isr + movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) + mov r21=cr.ipsr + mov r31=pr + ;; +#ifdef CONFIG_DISABLE_VHPT + shr.u r22=r16,61 // get the region number into r21 + ;; + cmp.gt p8,p0=6,r22 // access to region 0-5 + ;; +(p8) thash r17=r16 + ;; +(p8) mov cr.iha=r17 +(p8) mov r29=b0 // save b0 +(p8) br.cond.dptk dtlb_fault +#endif + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl + and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field + tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? + shr.u r18=r16,57 // move address bit 61 to bit 4 + and r19=r19,r16 // clear ed, reserved bits, and PTE control bits + tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on? + ;; + andcm r18=0x10,r18 // bit 4=~address-bit(61) + cmp.ne p8,p0=r0,r23 +(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field +(p8) br.cond.spnt page_fault + + dep r21=-1,r21,IA64_PSR_ED_BIT,1 + or r19=r19,r17 // insert PTE control bits into r19 + ;; + or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 +(p6) mov cr.ipsr=r21 + ;; +(p7) itc.d r19 // insert the TLB entry + mov pr=r31,-1 + rfi +END(alt_dtlb_miss) + + .org ia64_ivt+0x1400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45) +ENTRY(nested_dtlb_miss) + /* + * In the absence of kernel bugs, we get here when the virtually mapped linear + * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction + * Access-bit, or Data Access-bit faults). 
If the DTLB entry for the virtual page + * table is missing, a nested TLB miss fault is triggered and control is + * transferred to this point. When this happens, we lookup the pte for the + * faulting address by walking the page table in physical mode and return to the + * continuation point passed in register r30 (or call page_fault if the address is + * not mapped). + * + * Input: r16: faulting address + * r29: saved b0 + * r30: continuation address + * r31: saved pr + * + * Output: r17: physical address of L3 PTE of faulting address + * r29: saved b0 + * r30: continuation address + * r31: saved pr + * + * Clobbered: b0, r18, r19, r21, psr.dt (cleared) + */ + rsm psr.dt // switch to using physical data addressing + mov r19=IA64_KR(PT_BASE) // get the page table base address + shl r21=r16,3 // shift bit 60 into sign bit + ;; + shr.u r17=r16,61 // get the region number into r17 + ;; + cmp.eq p6,p7=5,r17 // is faulting address in region 5? + shr.u r18=r16,PGDIR_SHIFT // get bits 33-63 of faulting address + ;; +(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place + + srlz.d + LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir + + .pred.rel "mutex", p6, p7 +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 + ;; +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) + cmp.eq p7,p6=0,r21 // unused address bits all zeroes? + shr.u r18=r16,PMD_SHIFT // shift L2 index into position + ;; + ld8 r17=[r17] // fetch the L1 entry (may be 0) + ;; +(p7) cmp.eq p6,p7=r17,r0 // was L1 entry NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry + ;; +(p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) + shr.u r19=r16,PAGE_SHIFT // shift L3 index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? 
+ dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry
+(p6) br.cond.spnt page_fault
+ mov b0=r30
+ br.sptk.many b0 // return to continuation point
+END(nested_dtlb_miss)
+
+ .org ia64_ivt+0x1800
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
+ENTRY(ikey_miss)
+ DBG_FAULT(6)
+ FAULT(6)
+END(ikey_miss)
+
+ //-----------------------------------------------------------------------------------
+ // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address)
+ENTRY(page_fault)
+ ssm psr.dt
+ ;;
+ srlz.i
+ ;;
+ SAVE_MIN_WITH_COVER
+ alloc r15=ar.pfs,0,0,3,0
+ mov out0=cr.ifa
+ mov out1=cr.isr
+ adds r3=8,r2 // set up second base pointer
+ ;;
+ ssm psr.ic | PSR_DEFAULT_BITS
+ ;;
+ srlz.i // guarantee that interruption collection is on
+ ;;
+(p15) ssm psr.i // restore psr.i
+ movl r14=ia64_leave_kernel
+ ;;
+ SAVE_REST
+ mov rp=r14
+ ;;
+ adds out2=16,r12 // out2 = pointer to pt_regs
+ br.call.sptk.many b6=ia64_do_page_fault // ignore return address
+END(page_fault)
+
+ .org ia64_ivt+0x1c00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
+ENTRY(dkey_miss)
+ DBG_FAULT(7)
+ FAULT(7)
+END(dkey_miss)
+
+ .org ia64_ivt+0x2000
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
+ENTRY(dirty_bit)
+ DBG_FAULT(8)
+ /*
+ * What we do here is to simply turn on the dirty bit in the PTE. We need to
+ * update both the page-table and the TLB entry. To efficiently access the PTE,
+ * we address it through the virtual page table. Most likely, the TLB entry for
+ * the relevant virtual page table page is still present in the TLB so we can
+ * normally do this without additional TLB misses. In case the necessary virtual
+ * page table TLB entry isn't present, we take a nested TLB miss hit where we look
+ * up the physical address of the L3 PTE and then continue at label 1 below.
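+ *
+ * In C terms, the SMP path below is a lock-free read-modify-write,
+ * roughly (a sketch; ar.ccv/cmpxchg8.acq supply the atomicity):
+ *
+ * old = *pte;
+ * new = old | _PAGE_D | _PAGE_A;
+ * if (cmpxchg(pte, old, new) == old)
+ * install new in the TLB with itc.d;
+ * if (*pte != new) // e.g. a ptc.g raced with us
+ * purge the possibly stale entry with ptc.l;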
+ */ + mov r16=cr.ifa // get the address that caused the fault + movl r30=1f // load continuation point in case of nested fault + ;; + thash r17=r16 // compute virtual address of L3 PTE + mov r29=b0 // save b0 in case of nested fault + mov r31=pr // save pr +#ifdef CONFIG_SMP + mov r28=ar.ccv // save ar.ccv + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + mov ar.ccv=r18 // set compare value for cmpxchg + or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits + ;; + cmpxchg8.acq r26=[r17],r25,ar.ccv + mov r24=PAGE_SHIFT<<2 + ;; + cmp.eq p6,p7=r26,r18 + ;; +(p6) itc.d r25 // install updated PTE + ;; + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + ld8 r18=[r17] // read PTE again + ;; + cmp.eq p6,p7=r18,r25 // is it same as the newly installed + ;; +(p7) ptc.l r16,r24 + mov b0=r29 // restore b0 + mov ar.ccv=r28 +#else + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits + mov b0=r29 // restore b0 + ;; + st8 [r17]=r18 // store back updated PTE + itc.d r18 // install updated PTE +#endif + mov pr=r31,-1 // restore pr + rfi +END(dirty_bit) + + .org ia64_ivt+0x2400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27) +ENTRY(iaccess_bit) + DBG_FAULT(9) + // Like Entry 8, except for instruction access + mov r16=cr.ifa // get the address that caused the fault + movl r30=1f // load continuation point in case of nested fault + mov r31=pr // save predicates +#ifdef CONFIG_ITANIUM + /* + * Erratum 10 (IFA may contain incorrect address) has "NoFix" status. + */ + mov r17=cr.ipsr + ;; + mov r18=cr.iip + tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set? 
+ ;;
+(p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa
+#endif /* CONFIG_ITANIUM */
+ ;;
+ thash r17=r16 // compute virtual address of L3 PTE
+ mov r29=b0 // save b0 in case of nested fault
+#ifdef CONFIG_SMP
+ mov r28=ar.ccv // save ar.ccv
+ ;;
+1: ld8 r18=[r17]
+ ;;
+ mov ar.ccv=r18 // set compare value for cmpxchg
+ or r25=_PAGE_A,r18 // set the accessed bit
+ ;;
+ cmpxchg8.acq r26=[r17],r25,ar.ccv
+ mov r24=PAGE_SHIFT<<2
+ ;;
+ cmp.eq p6,p7=r26,r18
+ ;;
+(p6) itc.i r25 // install updated PTE
+ ;;
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+
+ ld8 r18=[r17] // read PTE again
+ ;;
+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
+ ;;
+(p7) ptc.l r16,r24
+ mov b0=r29 // restore b0
+ mov ar.ccv=r28
+#else /* !CONFIG_SMP */
+ ;;
+1: ld8 r18=[r17]
+ ;;
+ or r18=_PAGE_A,r18 // set the accessed bit
+ mov b0=r29 // restore b0
+ ;;
+ st8 [r17]=r18 // store back updated PTE
+ itc.i r18 // install updated PTE
+#endif /* !CONFIG_SMP */
+ mov pr=r31,-1
+ rfi
+END(iaccess_bit)
+
+ .org ia64_ivt+0x2800
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
+ENTRY(daccess_bit)
+ DBG_FAULT(10)
+ // Like Entry 8, except for data access
+ mov r16=cr.ifa // get the address that caused the fault
+ movl r30=1f // load continuation point in case of nested fault
+ ;;
+ thash r17=r16 // compute virtual address of L3 PTE
+ mov r31=pr
+ mov r29=b0 // save b0 in case of nested fault
+#ifdef CONFIG_SMP
+ mov r28=ar.ccv // save ar.ccv
+ ;;
+1: ld8 r18=[r17]
+ ;; // avoid RAW on r18
+ mov ar.ccv=r18 // set compare value for cmpxchg
+ or r25=_PAGE_A,r18 // set the accessed bit
+ ;;
+ cmpxchg8.acq r26=[r17],r25,ar.ccv
+ mov r24=PAGE_SHIFT<<2
+ ;;
+ cmp.eq p6,p7=r26,r18
+ ;;
+(p6) itc.d r25 // install updated PTE
+ /*
+ * Tell the assemblers dependency-violation checker that the above "itc" instructions
+ * cannot possibly affect the following loads:
+ */
+ dv_serialize_data
+ ;;
+ ld8 r18=[r17] // read PTE again
+ ;;
+ cmp.eq p6,p7=r18,r25 // is it same as the newly installed
+ ;;
+(p7) ptc.l r16,r24
+ mov ar.ccv=r28
+#else
+ ;;
+1: ld8 r18=[r17]
+ ;; // avoid RAW on r18
+ or r18=_PAGE_A,r18 // set the accessed bit
+ ;;
+ st8 [r17]=r18 // store back updated PTE
+ itc.d r18 // install updated PTE
+#endif
+ mov b0=r29 // restore b0
+ mov pr=r31,-1
+ rfi
+END(daccess_bit)
+
+ .org ia64_ivt+0x2c00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
+ENTRY(break_fault)
+ /*
+ * The streamlined system call entry/exit paths only save/restore the initial part
+ * of pt_regs. This implies that the callers of system-calls must adhere to the
+ * normal procedure calling conventions.
+ *
+ * Registers to be saved & restored:
+ * CR registers: cr.ipsr, cr.iip, cr.ifs
+ * AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr
+ * others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15
+ * Registers to be restored only:
+ * r8-r11: output value from the system call.
+ *
+ * During system call exit, scratch registers (including r15) are modified/cleared
+ * to prevent leaking bits from kernel to user level.
+ */
+ DBG_FAULT(11)
+ mov r16=IA64_KR(CURRENT) // r16 = current task; 12 cycle read lat.
+ mov r17=cr.iim + mov r18=__IA64_BREAK_SYSCALL + mov r21=ar.fpsr + mov r29=cr.ipsr + mov r19=b6 + mov r25=ar.unat + mov r27=ar.rsc + mov r26=ar.pfs + mov r28=cr.iip + mov r31=pr // prepare to save predicates + mov r20=r1 + ;; + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 + cmp.eq p0,p7=r18,r17 // is this a system call? (p7 <- false, if so) +(p7) br.cond.spnt non_syscall + ;; + ld1 r17=[r16] // load current->thread.on_ustack flag + st1 [r16]=r0 // clear current->thread.on_ustack flag + add r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // set r1 for MINSTATE_START_SAVE_MIN_VIRT + ;; + invala + + /* adjust return address so we skip over the break instruction: */ + + extr.u r8=r29,41,2 // extract ei field from cr.ipsr + ;; + cmp.eq p6,p7=2,r8 // isr.ei==2? + mov r2=r1 // setup r2 for ia64_syscall_setup + ;; +(p6) mov r8=0 // clear ei to 0 +(p6) adds r28=16,r28 // switch cr.iip to next bundle cr.ipsr.ei wrapped +(p7) adds r8=1,r8 // increment ei to next slot + ;; + cmp.eq pKStk,pUStk=r0,r17 // are we in kernel mode already? + dep r29=r8,r29,41,2 // insert new ei into cr.ipsr + ;; + + // switch from user to kernel RBS: + MINSTATE_START_SAVE_MIN_VIRT + br.call.sptk.many b7=ia64_syscall_setup + ;; + MINSTATE_END_SAVE_MIN_VIRT // switch to bank 1 + ssm psr.ic | PSR_DEFAULT_BITS + ;; + srlz.i // guarantee that interruption collection is on + mov r3=NR_syscalls - 1 + ;; +(p15) ssm psr.i // restore psr.i + // p10==true means out registers are more than 8 or r15's Nat is true +(p10) br.cond.spnt.many ia64_ret_from_syscall + ;; + movl r16=sys_call_table + + adds r15=-1024,r15 // r15 contains the syscall number---subtract 1024 + movl r2=ia64_ret_from_syscall + ;; + shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024) + cmp.leu p6,p7=r15,r3 // (syscall > 0 && syscall < 1024 + NR_syscalls) ? 
+ mov rp=r2 // set the real return addr + ;; +(p6) ld8 r20=[r20] // load address of syscall entry point +(p7) movl r20=sys_ni_syscall + + add r2=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; + ld4 r2=[r2] // r2 = current_thread_info()->flags + ;; + and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit + ;; + cmp.eq p8,p0=r2,r0 + mov b6=r20 + ;; +(p8) br.call.sptk.many b6=b6 // ignore this return addr + br.cond.sptk ia64_trace_syscall + // NOT REACHED +END(break_fault) + + .org ia64_ivt+0x3000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4) +ENTRY(interrupt) + DBG_FAULT(12) + mov r31=pr // prepare to save predicates + ;; + SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3 + ssm psr.ic | PSR_DEFAULT_BITS + ;; + adds r3=8,r2 // set up second base pointer for SAVE_REST + srlz.i // ensure everybody knows psr.ic is back on + ;; + SAVE_REST + ;; + alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group + mov out0=cr.ivr // pass cr.ivr as first arg + add out1=16,sp // pass pointer to pt_regs as second arg + ;; + srlz.d // make sure we see the effect of cr.ivr + movl r14=ia64_leave_kernel + ;; + mov rp=r14 + br.call.sptk.many b6=ia64_handle_irq +END(interrupt) + + .org ia64_ivt+0x3400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3400 Entry 13 (size 64 bundles) Reserved + DBG_FAULT(13) + FAULT(13) + + .org ia64_ivt+0x3800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3800 Entry 14 (size 64 bundles) Reserved + DBG_FAULT(14) + FAULT(14) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... + * + * ia64_syscall_setup() is a separate subroutine so that it can + * allocate stacked registers so it can safely demine any + * potential NaT values from the input registers. + * + * On entry: + * - executing on bank 0 or bank 1 register set (doesn't matter) + * - r1: stack pointer + * - r2: current task pointer + * - r3: preserved + * - r11: original contents (saved ar.pfs to be saved) + * - r12: original contents (sp to be saved) + * - r13: original contents (tp to be saved) + * - r15: original contents (syscall # to be saved) + * - r18: saved bsp (after switching to kernel stack) + * - r19: saved b6 + * - r20: saved r1 (gp) + * - r21: saved ar.fpsr + * - r22: kernel's register backing store base (krbs_base) + * - r23: saved ar.bspstore + * - r24: saved ar.rnat + * - r25: saved ar.unat + * - r26: saved ar.pfs + * - r27: saved ar.rsc + * - r28: saved cr.iip + * - r29: saved cr.ipsr + * - r31: saved pr + * - b0: original contents (to be saved) + * On exit: + * - executing on bank 1 registers + * - psr.ic enabled, interrupts restored + * - p10: TRUE if syscall is invoked with more than 8 out + * registers or r15's Nat is true + * - r1: kernel's gp + * - r3: preserved (same as on entry) + * - r8: -EINVAL if p10 is true + * - r12: points to kernel stack + * - r13: points to current task + * - p15: TRUE if interrupts need to be re-enabled + * - ar.fpsr: set to kernel settings + */ +GLOBAL_ENTRY(ia64_syscall_setup) +#if PT(B6) != 0 +# error This code assumes that b6 is the first field in pt_regs. 
+#endif + st8 [r1]=r19 // save b6 + add r16=PT(CR_IPSR),r1 // initialize first base pointer + add r17=PT(R11),r1 // initialize second base pointer + ;; + alloc r19=ar.pfs,8,0,0,0 // ensure in0-in7 are writable + st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR) // save cr.ipsr + tnat.nz p8,p0=in0 + + st8.spill [r17]=r11,PT(CR_IIP)-PT(R11) // save r11 + tnat.nz p9,p0=in1 +(pKStk) mov r18=r0 // make sure r18 isn't NaT + ;; + + st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS) // save ar.pfs + st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP) // save cr.iip + mov r28=b0 // save b0 (2 cyc) + ;; + + st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT) // save ar.unat + dep r19=0,r19,38,26 // clear all bits but 0..37 [I0] +(p8) mov in0=-1 + ;; + + st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS) // store ar.pfs.pfm in cr.ifs + extr.u r11=r19,7,7 // I0 // get sol of ar.pfs + and r8=0x7f,r19 // A // get sof of ar.pfs + + st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc + tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0 +(p9) mov in1=-1 + ;; + +(pUStk) sub r18=r18,r22 // r18=RSE.ndirty*8 + tnat.nz p10,p0=in2 + add r11=8,r11 + ;; +(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16 // skip over ar_rnat field +(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17 // skip over ar_bspstore field + tnat.nz p11,p0=in3 + ;; +(p10) mov in2=-1 + tnat.nz p12,p0=in4 // [I0] +(p11) mov in3=-1 + ;; +(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT) // save ar.rnat +(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE) // save ar.bspstore + shl r18=r18,16 // compute ar.rsc to be used for "loadrs" + ;; + st8 [r16]=r31,PT(LOADRS)-PT(PR) // save predicates + st8 [r17]=r28,PT(R1)-PT(B0) // save b0 + tnat.nz p13,p0=in5 // [I0] + ;; + st8 [r16]=r18,PT(R12)-PT(LOADRS) // save ar.rsc value for "loadrs" + st8.spill [r17]=r20,PT(R13)-PT(R1) // save original r1 +(p12) mov in4=-1 + ;; + +.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12) // save r12 +.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13) // save r13 +(p13) mov in5=-1 + ;; + st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr + tnat.nz p14,p0=in6 + cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8 + ;; + stf8 [r16]=f1 // ensure pt_regs.r8 != 0 (see handle_syscall_error) +(p9) tnat.nz p10,p0=r15 + adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch) + + st8.spill [r17]=r15 // save r15 + tnat.nz p8,p0=in7 + nop.i 0 + + mov r13=r2 // establish `current' + movl r1=__gp // establish kernel global pointer + ;; +(p14) mov in6=-1 +(p8) mov in7=-1 + nop.i 0 + + cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 + movl r17=FPSR_DEFAULT + ;; + mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value +(p10) mov r8=-EINVAL + br.ret.sptk.many b7 +END(ia64_syscall_setup) + + .org ia64_ivt+0x3c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3c00 Entry 15 (size 64 bundles) Reserved + DBG_FAULT(15) + FAULT(15) + + /* + * Squatting in this space ... + * + * This special case dispatcher for illegal operation faults allows preserved + * registers to be modified through a callback function (asm only) that is handed + * back from the fault handler in r8. Up to three arguments can be passed to the + * callback function by returning an aggregate with the callback as its first + * element, followed by the arguments. 
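+ *
+ * In C terms the fault handler's return value is an aggregate along the
+ * lines of (a sketch; the real definition lives with
+ * ia64_illegal_op_fault):
+ *
+ * struct illegal_op_return {
+ * unsigned long fkt; // callback to run, or 0
+ * unsigned long arg1, arg2, arg3; // handed over in r9-r11 below
+ * };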
+ */ +ENTRY(dispatch_illegal_op_fault) + .prologue + .body + SAVE_MIN_WITH_COVER + ssm psr.ic | PSR_DEFAULT_BITS + ;; + srlz.i // guarantee that interruption collection is on + ;; +(p15) ssm psr.i // restore psr.i + adds r3=8,r2 // set up second base pointer for SAVE_REST + ;; + alloc r14=ar.pfs,0,0,1,0 // must be first in insn group + mov out0=ar.ec + ;; + SAVE_REST + PT_REGS_UNWIND_INFO(0) + ;; + br.call.sptk.many rp=ia64_illegal_op_fault +.ret0: ;; + alloc r14=ar.pfs,0,0,3,0 // must be first in insn group + mov out0=r9 + mov out1=r10 + mov out2=r11 + movl r15=ia64_leave_kernel + ;; + mov rp=r15 + mov b6=r8 + ;; + cmp.ne p6,p0=0,r8 +(p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel + br.sptk.many ia64_leave_kernel +END(dispatch_illegal_op_fault) + + .org ia64_ivt+0x4000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4000 Entry 16 (size 64 bundles) Reserved + DBG_FAULT(16) + FAULT(16) + + .org ia64_ivt+0x4400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4400 Entry 17 (size 64 bundles) Reserved + DBG_FAULT(17) + FAULT(17) + +ENTRY(non_syscall) + SAVE_MIN_WITH_COVER + + // There is no particular reason for this code to be here, other than that + // there happens to be space here that would go unused otherwise. If this + // fault ever gets "unreserved", simply moved the following code to a more + // suitable spot... + + alloc r14=ar.pfs,0,0,2,0 + mov out0=cr.iim + add out1=16,sp + adds r3=8,r2 // set up second base pointer for SAVE_REST + + ssm psr.ic | PSR_DEFAULT_BITS + ;; + srlz.i // guarantee that interruption collection is on + ;; +(p15) ssm psr.i // restore psr.i + movl r15=ia64_leave_kernel + ;; + SAVE_REST + mov rp=r15 + ;; + br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr +END(non_syscall) + + .org ia64_ivt+0x4800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4800 Entry 18 (size 64 bundles) Reserved + DBG_FAULT(18) + FAULT(18) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... + */ + +ENTRY(dispatch_unaligned_handler) + SAVE_MIN_WITH_COVER + ;; + alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) + mov out0=cr.ifa + adds out1=16,sp + + ssm psr.ic | PSR_DEFAULT_BITS + ;; + srlz.i // guarantee that interruption collection is on + ;; +(p15) ssm psr.i // restore psr.i + adds r3=8,r2 // set up second base pointer + ;; + SAVE_REST + movl r14=ia64_leave_kernel + ;; + mov rp=r14 + br.sptk.many ia64_prepare_handle_unaligned +END(dispatch_unaligned_handler) + + .org ia64_ivt+0x4c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4c00 Entry 19 (size 64 bundles) Reserved + DBG_FAULT(19) + FAULT(19) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... 
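+ *
+ * dispatch_to_fault_handler funnels all the FAULT(n) entries into the C
+ * routine ia64_fault(); given the out-register setup below, the call
+ * amounts to roughly (a sketch):
+ *
+ * ia64_fault(vector, cr.isr, cr.ifa, cr.iim, cr.itir, ...);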
+ */ + +ENTRY(dispatch_to_fault_handler) + /* + * Input: + * psr.ic: off + * r19: fault vector number (e.g., 24 for General Exception) + * r31: contains saved predicates (pr) + */ + SAVE_MIN_WITH_COVER_R19 + alloc r14=ar.pfs,0,0,5,0 + mov out0=r15 + mov out1=cr.isr + mov out2=cr.ifa + mov out3=cr.iim + mov out4=cr.itir + ;; + ssm psr.ic | PSR_DEFAULT_BITS + ;; + srlz.i // guarantee that interruption collection is on + ;; +(p15) ssm psr.i // restore psr.i + adds r3=8,r2 // set up second base pointer for SAVE_REST + ;; + SAVE_REST + movl r14=ia64_leave_kernel + ;; + mov rp=r14 + br.call.sptk.many b6=ia64_fault +END(dispatch_to_fault_handler) + +// +// --- End of long entries, Beginning of short entries +// + + .org ia64_ivt+0x5000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49) +ENTRY(page_not_present) + DBG_FAULT(20) + mov r16=cr.ifa + rsm psr.dt + /* + * The Linux page fault handler doesn't expect non-present pages to be in + * the TLB. Flush the existing entry now, so we meet that expectation. + */ + mov r17=PAGE_SHIFT<<2 + ;; + ptc.l r16,r17 + ;; + mov r31=pr + srlz.d + br.sptk.many page_fault +END(page_not_present) + + .org ia64_ivt+0x5100 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52) +ENTRY(key_permission) + DBG_FAULT(21) + mov r16=cr.ifa + rsm psr.dt + mov r31=pr + ;; + srlz.d + br.sptk.many page_fault +END(key_permission) + + .org ia64_ivt+0x5200 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26) +ENTRY(iaccess_rights) + DBG_FAULT(22) + mov r16=cr.ifa + rsm psr.dt + mov r31=pr + ;; + srlz.d + br.sptk.many page_fault +END(iaccess_rights) + + .org ia64_ivt+0x5300 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53) +ENTRY(daccess_rights) + DBG_FAULT(23) + mov r16=cr.ifa + rsm psr.dt + mov r31=pr + ;; + srlz.d + br.sptk.many page_fault +END(daccess_rights) + + .org ia64_ivt+0x5400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39) +ENTRY(general_exception) + DBG_FAULT(24) + mov r16=cr.isr + mov r31=pr + ;; + cmp4.eq p6,p0=0,r16 +(p6) br.sptk.many dispatch_illegal_op_fault + ;; + mov r19=24 // fault number + br.sptk.many dispatch_to_fault_handler +END(general_exception) + + .org ia64_ivt+0x5500 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35) +ENTRY(disabled_fp_reg) + DBG_FAULT(25) + rsm psr.dfh // ensure we can access fph + ;; + srlz.d + mov r31=pr + mov r19=25 + br.sptk.many dispatch_to_fault_handler +END(disabled_fp_reg) + + .org ia64_ivt+0x5600 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50) +ENTRY(nat_consumption) + DBG_FAULT(26) + FAULT(26) +END(nat_consumption) + + .org ia64_ivt+0x5700 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5700 Entry 27 (size 16 bundles) Speculation (40) +ENTRY(speculation_vector) + DBG_FAULT(27) + /* + * A [f]chk.[as] instruction needs to take the branch to 
the recovery code but + * this part of the architecture is not implemented in hardware on some CPUs, such + * as Itanium. Thus, in general we need to emulate the behavior. IIM contains + * the relative target (not yet sign extended). So after sign extending it we + * simply add it to IIP. We also need to reset the EI field of the IPSR to zero, + * i.e., the slot to restart into. + * + * cr.imm contains zero_ext(imm21) + */ + mov r18=cr.iim + ;; + mov r17=cr.iip + shl r18=r18,43 // put sign bit in position (43=64-21) + ;; + + mov r16=cr.ipsr + shr r18=r18,39 // sign extend (39=43-4) + ;; + + add r17=r17,r18 // now add the offset + ;; + mov cr.iip=r17 + dep r16=0,r16,41,2 // clear EI + ;; + + mov cr.ipsr=r16 + ;; + + rfi // and go back +END(speculation_vector) + + .org ia64_ivt+0x5800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5800 Entry 28 (size 16 bundles) Reserved + DBG_FAULT(28) + FAULT(28) + + .org ia64_ivt+0x5900 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56) +ENTRY(debug_vector) + DBG_FAULT(29) + FAULT(29) +END(debug_vector) + + .org ia64_ivt+0x5a00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57) +ENTRY(unaligned_access) + DBG_FAULT(30) + mov r16=cr.ipsr + mov r31=pr // prepare to save predicates + ;; + br.sptk.many dispatch_unaligned_handler +END(unaligned_access) + + .org ia64_ivt+0x5b00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57) +ENTRY(unsupported_data_reference) + DBG_FAULT(31) + FAULT(31) +END(unsupported_data_reference) + + .org ia64_ivt+0x5c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64) +ENTRY(floating_point_fault) + DBG_FAULT(32) + FAULT(32) +END(floating_point_fault) + + .org ia64_ivt+0x5d00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66) +ENTRY(floating_point_trap) + DBG_FAULT(33) + FAULT(33) +END(floating_point_trap) + + .org ia64_ivt+0x5e00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66) +ENTRY(lower_privilege_trap) + DBG_FAULT(34) + FAULT(34) +END(lower_privilege_trap) + + .org ia64_ivt+0x5f00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68) +ENTRY(taken_branch_trap) + DBG_FAULT(35) + FAULT(35) +END(taken_branch_trap) + + .org ia64_ivt+0x6000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69) +ENTRY(single_step_trap) + DBG_FAULT(36) + FAULT(36) +END(single_step_trap) + + .org ia64_ivt+0x6100 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6100 Entry 37 (size 16 bundles) Reserved + DBG_FAULT(37) + FAULT(37) + + .org ia64_ivt+0x6200 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6200 Entry 38 (size 16 bundles) Reserved + DBG_FAULT(38) + FAULT(38) + + .org ia64_ivt+0x6300 
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6300 Entry 39 (size 16 bundles) Reserved
+ DBG_FAULT(39)
+ FAULT(39)
+
+ .org ia64_ivt+0x6400
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6400 Entry 40 (size 16 bundles) Reserved
+ DBG_FAULT(40)
+ FAULT(40)
+
+ .org ia64_ivt+0x6500
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6500 Entry 41 (size 16 bundles) Reserved
+ DBG_FAULT(41)
+ FAULT(41)
+
+ .org ia64_ivt+0x6600
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6600 Entry 42 (size 16 bundles) Reserved
+ DBG_FAULT(42)
+ FAULT(42)
+
+ .org ia64_ivt+0x6700
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6700 Entry 43 (size 16 bundles) Reserved
+ DBG_FAULT(43)
+ FAULT(43)
+
+ .org ia64_ivt+0x6800
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6800 Entry 44 (size 16 bundles) Reserved
+ DBG_FAULT(44)
+ FAULT(44)
+
+ .org ia64_ivt+0x6900
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6900 Entry 45 (size 16 bundles) IA-32 Exception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
+ENTRY(ia32_exception)
+ DBG_FAULT(45)
+ FAULT(45)
+END(ia32_exception)
+
+ .org ia64_ivt+0x6a00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71)
+ENTRY(ia32_intercept)
+ DBG_FAULT(46)
+#ifdef CONFIG_IA32_SUPPORT
+ mov r31=pr
+ mov r16=cr.isr
+ ;;
+ extr.u r17=r16,16,8 // get ISR.code
+ mov r18=ar.eflag
+ mov r19=cr.iim // old eflag value
+ ;;
+ cmp.ne p6,p0=2,r17
+(p6) br.cond.spnt 1f // not a system flag fault
+ xor r16=r18,r19
+ ;;
+ extr.u r17=r16,18,1 // get the eflags.ac bit
+ ;;
+ cmp.eq p6,p0=0,r17
+(p6) br.cond.spnt 1f // eflags.ac bit didn't change
+ ;;
+ mov pr=r31,-1 // restore predicate registers
+ rfi
+
+1:
+#endif // CONFIG_IA32_SUPPORT
+ FAULT(46)
+END(ia32_intercept)
+
+ .org ia64_ivt+0x6b00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74)
+ENTRY(ia32_interrupt)
+ DBG_FAULT(47)
+#ifdef CONFIG_IA32_SUPPORT
+ mov r31=pr
+ br.sptk.many dispatch_to_ia32_handler
+#else
+ FAULT(47)
+#endif
+END(ia32_interrupt)
+
+ .org ia64_ivt+0x6c00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6c00 Entry 48 (size 16 bundles) Reserved
+ DBG_FAULT(48)
+ FAULT(48)
+
+ .org ia64_ivt+0x6d00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6d00 Entry 49 (size 16 bundles) Reserved
+ DBG_FAULT(49)
+ FAULT(49)
+
+ .org ia64_ivt+0x6e00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6e00 Entry 50 (size 16 bundles) Reserved
+ DBG_FAULT(50)
+ FAULT(50)
+
+ .org ia64_ivt+0x6f00
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x6f00 Entry 51 (size 16 bundles) Reserved
+ DBG_FAULT(51)
+ FAULT(51)
+
+ .org ia64_ivt+0x7000
+/////////////////////////////////////////////////////////////////////////////////////////
+// 0x7000 Entry 52 (size 16 bundles) Reserved
+ DBG_FAULT(52)
+ FAULT(52)
+
+ .org ia64_ivt+0x7100
+///////////////////////////////////////////////////////////////////////////////////////// +// 0x7100 Entry 53 (size 16 bundles) Reserved + DBG_FAULT(53) + FAULT(53) + + .org ia64_ivt+0x7200 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7200 Entry 54 (size 16 bundles) Reserved + DBG_FAULT(54) + FAULT(54) + + .org ia64_ivt+0x7300 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7300 Entry 55 (size 16 bundles) Reserved + DBG_FAULT(55) + FAULT(55) + + .org ia64_ivt+0x7400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7400 Entry 56 (size 16 bundles) Reserved + DBG_FAULT(56) + FAULT(56) + + .org ia64_ivt+0x7500 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7500 Entry 57 (size 16 bundles) Reserved + DBG_FAULT(57) + FAULT(57) + + .org ia64_ivt+0x7600 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7600 Entry 58 (size 16 bundles) Reserved + DBG_FAULT(58) + FAULT(58) + + .org ia64_ivt+0x7700 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7700 Entry 59 (size 16 bundles) Reserved + DBG_FAULT(59) + FAULT(59) + + .org ia64_ivt+0x7800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7800 Entry 60 (size 16 bundles) Reserved + DBG_FAULT(60) + FAULT(60) + + .org ia64_ivt+0x7900 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7900 Entry 61 (size 16 bundles) Reserved + DBG_FAULT(61) + FAULT(61) + + .org ia64_ivt+0x7a00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7a00 Entry 62 (size 16 bundles) Reserved + DBG_FAULT(62) + FAULT(62) + + .org ia64_ivt+0x7b00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7b00 Entry 63 (size 16 bundles) Reserved + DBG_FAULT(63) + FAULT(63) + + .org ia64_ivt+0x7c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7c00 Entry 64 (size 16 bundles) Reserved + DBG_FAULT(64) + FAULT(64) + + .org ia64_ivt+0x7d00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7d00 Entry 65 (size 16 bundles) Reserved + DBG_FAULT(65) + FAULT(65) + + .org ia64_ivt+0x7e00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7e00 Entry 66 (size 16 bundles) Reserved + DBG_FAULT(66) + FAULT(66) + + .org ia64_ivt+0x7f00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7f00 Entry 67 (size 16 bundles) Reserved + DBG_FAULT(67) + FAULT(67) + +#ifdef CONFIG_IA32_SUPPORT + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... 
+ */ + + // IA32 interrupt entry point + +ENTRY(dispatch_to_ia32_handler) + SAVE_MIN + ;; + mov r14=cr.isr + ssm psr.ic | PSR_DEFAULT_BITS + ;; + srlz.i // guarantee that interruption collection is on + ;; +(p15) ssm psr.i + adds r3=8,r2 // Base pointer for SAVE_REST + ;; + SAVE_REST + ;; + mov r15=0x80 + shr r14=r14,16 // Get interrupt number + ;; + cmp.ne p6,p0=r14,r15 +(p6) br.call.dpnt.many b6=non_ia32_syscall + + adds r14=IA64_PT_REGS_R8_OFFSET + 16,sp // 16 byte hole per SW conventions + adds r15=IA64_PT_REGS_R1_OFFSET + 16,sp + ;; + cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 + ld8 r8=[r14] // get r8 + ;; + st8 [r15]=r8 // save original EAX in r1 (IA32 procs don't use the GP) + ;; + alloc r15=ar.pfs,0,0,6,0 // must first in an insn group + ;; + ld4 r8=[r14],8 // r8 == eax (syscall number) + mov r15=IA32_NR_syscalls + ;; + cmp.ltu.unc p6,p7=r8,r15 + ld4 out1=[r14],8 // r9 == ecx + ;; + ld4 out2=[r14],8 // r10 == edx + ;; + ld4 out0=[r14] // r11 == ebx + adds r14=(IA64_PT_REGS_R13_OFFSET) + 16,sp + ;; + ld4 out5=[r14],PT(R14)-PT(R13) // r13 == ebp + ;; + ld4 out3=[r14],PT(R15)-PT(R14) // r14 == esi + adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; + ld4 out4=[r14] // r15 == edi + movl r16=ia32_syscall_table + ;; +(p6) shladd r16=r8,3,r16 // force ni_syscall if not valid syscall number + ld4 r2=[r2] // r2 = current_thread_info()->flags + ;; + ld8 r16=[r16] + and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit + ;; + mov b6=r16 + movl r15=ia32_ret_from_syscall + cmp.eq p8,p0=r2,r0 + ;; + mov rp=r15 +(p8) br.call.sptk.many b6=b6 + br.cond.sptk ia32_trace_syscall + +non_ia32_syscall: + alloc r15=ar.pfs,0,0,2,0 + mov out0=r14 // interrupt # + add out1=16,sp // pointer to pt_regs + ;; // avoid WAW on CFM + br.call.sptk.many rp=ia32_bad_interrupt +.ret1: movl r15=ia64_leave_kernel + ;; + mov rp=r15 + br.ret.sptk.many rp +END(dispatch_to_ia32_handler) + +#endif /* CONFIG_IA32_SUPPORT */ diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c new file mode 100644 index 000000000000..c3a04ee7f4f6 --- /dev/null +++ b/arch/ia64/kernel/machvec.c @@ -0,0 +1,70 @@ +#include <linux/config.h> +#include <linux/module.h> + +#include <asm/machvec.h> +#include <asm/system.h> + +#ifdef CONFIG_IA64_GENERIC + +#include <linux/kernel.h> +#include <linux/string.h> + +#include <asm/page.h> + +struct ia64_machine_vector ia64_mv; +EXPORT_SYMBOL(ia64_mv); + +static struct ia64_machine_vector * +lookup_machvec (const char *name) +{ + extern struct ia64_machine_vector machvec_start[]; + extern struct ia64_machine_vector machvec_end[]; + struct ia64_machine_vector *mv; + + for (mv = machvec_start; mv < machvec_end; ++mv) + if (strcmp (mv->name, name) == 0) + return mv; + + return 0; +} + +void +machvec_init (const char *name) +{ + struct ia64_machine_vector *mv; + + mv = lookup_machvec(name); + if (!mv) { + panic("generic kernel failed to find machine vector for platform %s!", name); + } + ia64_mv = *mv; + printk(KERN_INFO "booting generic kernel on platform %s\n", name); +} + +#endif /* CONFIG_IA64_GENERIC */ + +void +machvec_setup (char **arg) +{ +} +EXPORT_SYMBOL(machvec_setup); + +void +machvec_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) +{ +} +EXPORT_SYMBOL(machvec_timer_interrupt); + +void +machvec_dma_sync_single (struct device *hwdev, dma_addr_t dma_handle, size_t size, int dir) +{ + mb(); +} +EXPORT_SYMBOL(machvec_dma_sync_single); + +void +machvec_dma_sync_sg (struct device *hwdev, struct scatterlist *sg, int n, int dir) +{ + mb(); +} +EXPORT_SYMBOL(machvec_dma_sync_sg); 
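Aside (added commentary, not part of the original commit): a minimal C
condensation of the int 0x80 path of dispatch_to_ia32_handler above, using
the register mapping the assembly sets up (out0..out5 = ebx, ecx, edx, esi,
edi, ebp).  The typedef, function name, and extern declarations are invented
for illustration; ia32_syscall_table and IA32_NR_syscalls are the symbols the
stub actually references, and slot 0 of the table is assumed to hold
sys_ni_syscall, matching the "force ni_syscall if not valid syscall number"
branch.

typedef long (*ia32_syscall_t) (long, long, long, long, long, long);

extern ia32_syscall_t ia32_syscall_table[];	/* real symbol, see the stub above */
extern unsigned long IA32_NR_syscalls;		/* a constant in the real code */

static long
ia32_syscall_sketch (unsigned long eax, long ebx, long ecx, long edx,
		     long esi, long edi, long ebp)
{
	/* invalid syscall numbers fall back to slot 0 (sys_ni_syscall) */
	ia32_syscall_t fn = ia32_syscall_table[eax < IA32_NR_syscalls ? eax : 0];

	return fn(ebx, ecx, edx, esi, edi, ebp);
}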
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c new file mode 100644 index 000000000000..4d6c7b8f667b --- /dev/null +++ b/arch/ia64/kernel/mca.c @@ -0,0 +1,1470 @@ +/* + * File: mca.c + * Purpose: Generic MCA handling layer + * + * Updated for latest kernel + * Copyright (C) 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * Copyright (C) 2002 Dell Inc. + * Copyright (C) Matt Domsch (Matt_Domsch@dell.com) + * + * Copyright (C) 2002 Intel + * Copyright (C) Jenna Hall (jenna.s.hall@intel.com) + * + * Copyright (C) 2001 Intel + * Copyright (C) Fred Lewis (frederick.v.lewis@intel.com) + * + * Copyright (C) 2000 Intel + * Copyright (C) Chuck Fleckenstein (cfleck@co.intel.com) + * + * Copyright (C) 1999, 2004 Silicon Graphics, Inc. + * Copyright (C) Vijay Chander(vijay@engr.sgi.com) + * + * 03/04/15 D. Mosberger Added INIT backtrace support. + * 02/03/25 M. Domsch GUID cleanups + * + * 02/01/04 J. Hall Aligned MCA stack to 16 bytes, added platform vs. CPU + * error flag, set SAL default return values, changed + * error record structure to linked list, added init call + * to sal_get_state_info_size(). + * + * 01/01/03 F. Lewis Added setup of CMCI and CPEI IRQs, logging of corrected + * platform errors, completed code for logging of + * corrected & uncorrected machine check errors, and + * updated for conformance with Nov. 2000 revision of the + * SAL 3.0 spec. + * 00/03/29 C. Fleckenstein Fixed PAL/SAL update issues, began MCA bug fixes, logging issues, + * added min save state dump, added INIT handler. + * + * 2003-12-08 Keith Owens <kaos@sgi.com> + * smp_call_function() must not be called from interrupt context (can + * deadlock on tasklist_lock). Use keventd to call smp_call_function(). + * + * 2004-02-01 Keith Owens <kaos@sgi.com> + * Avoid deadlock when using printk() for MCA and INIT records. + * Delete all record printing code, moved to salinfo_decode in user space. + * Mark variables and functions static where possible. + * Delete dead variables and functions. + * Reorder to remove the need for forward declarations and to consolidate + * related code. + */ +#include <linux/config.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kallsyms.h> +#include <linux/smp_lock.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/workqueue.h> + +#include <asm/delay.h> +#include <asm/machvec.h> +#include <asm/meminit.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/system.h> +#include <asm/sal.h> +#include <asm/mca.h> + +#include <asm/irq.h> +#include <asm/hw_irq.h> + +#if defined(IA64_MCA_DEBUG_INFO) +# define IA64_MCA_DEBUG(fmt...) printk(fmt) +#else +# define IA64_MCA_DEBUG(fmt...) 
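+/* note (added): with IA64_MCA_DEBUG_INFO unset this variant swallows its
+   arguments entirely, so IA64_MCA_DEBUG() calls must not rely on side
+   effects in their argument expressions */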
+#endif + +/* Used by mca_asm.S */ +ia64_mca_sal_to_os_state_t ia64_sal_to_os_handoff_state; +ia64_mca_os_to_sal_state_t ia64_os_to_sal_handoff_state; +u64 ia64_mca_serialize; +DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */ +DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */ +DEFINE_PER_CPU(u64, ia64_mca_pal_pte); /* PTE to map PAL code */ +DEFINE_PER_CPU(u64, ia64_mca_pal_base); /* vaddr PAL code granule */ + +unsigned long __per_cpu_mca[NR_CPUS]; + +/* In mca_asm.S */ +extern void ia64_monarch_init_handler (void); +extern void ia64_slave_init_handler (void); + +static ia64_mc_info_t ia64_mc_info; + +#define MAX_CPE_POLL_INTERVAL (15*60*HZ) /* 15 minutes */ +#define MIN_CPE_POLL_INTERVAL (2*60*HZ) /* 2 minutes */ +#define CMC_POLL_INTERVAL (1*60*HZ) /* 1 minute */ +#define CPE_HISTORY_LENGTH 5 +#define CMC_HISTORY_LENGTH 5 + +static struct timer_list cpe_poll_timer; +static struct timer_list cmc_poll_timer; +/* + * This variable tells whether we are currently in polling mode. + * Start with this in the wrong state so we won't play w/ timers + * before the system is ready. + */ +static int cmc_polling_enabled = 1; + +/* + * Clearing this variable prevents CPE polling from getting activated + * in mca_late_init. Use it if your system doesn't provide a CPEI, + * but encounters problems retrieving CPE logs. This should only be + * necessary for debugging. + */ +static int cpe_poll_enabled = 1; + +extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe); + +static int mca_init; + +/* + * IA64_MCA log support + */ +#define IA64_MAX_LOGS 2 /* Double-buffering for nested MCAs */ +#define IA64_MAX_LOG_TYPES 4 /* MCA, INIT, CMC, CPE */ + +typedef struct ia64_state_log_s +{ + spinlock_t isl_lock; + int isl_index; + unsigned long isl_count; + ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ +} ia64_state_log_t; + +static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; + +#define IA64_LOG_ALLOCATE(it, size) \ + {ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] = \ + (ia64_err_rec_t *)alloc_bootmem(size); \ + ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] = \ + (ia64_err_rec_t *)alloc_bootmem(size);} +#define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock) +#define IA64_LOG_LOCK(it) spin_lock_irqsave(&ia64_state_log[it].isl_lock, s) +#define IA64_LOG_UNLOCK(it) spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s) +#define IA64_LOG_NEXT_INDEX(it) ia64_state_log[it].isl_index +#define IA64_LOG_CURR_INDEX(it) 1 - ia64_state_log[it].isl_index +#define IA64_LOG_INDEX_INC(it) \ + {ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index; \ + ia64_state_log[it].isl_count++;} +#define IA64_LOG_INDEX_DEC(it) \ + ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index +#define IA64_LOG_NEXT_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)])) +#define IA64_LOG_CURR_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)])) +#define IA64_LOG_COUNT(it) ia64_state_log[it].isl_count + +/* + * ia64_log_init + * Reset the OS ia64 log buffer + * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE}) + * Outputs : None + */ +static void +ia64_log_init(int sal_info_type) +{ + u64 max_size = 0; + + IA64_LOG_NEXT_INDEX(sal_info_type) = 0; + IA64_LOG_LOCK_INIT(sal_info_type); + + // SAL will tell us the maximum size of any error record of this type + max_size = ia64_sal_get_state_info_size(sal_info_type); + if (!max_size) + /* 
alloc_bootmem() doesn't like zero-sized allocations! */ + return; + + // set up OS data structures to hold error info + IA64_LOG_ALLOCATE(sal_info_type, max_size); + memset(IA64_LOG_CURR_BUFFER(sal_info_type), 0, max_size); + memset(IA64_LOG_NEXT_BUFFER(sal_info_type), 0, max_size); +} + +/* + * ia64_log_get + * + * Get the current MCA log from SAL and copy it into the OS log buffer. + * + * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE}) + * irq_safe whether you can use printk at this point + * Outputs : size (total record length) + * *buffer (ptr to error record) + * + */ +static u64 +ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe) +{ + sal_log_record_header_t *log_buffer; + u64 total_len = 0; + int s; + + IA64_LOG_LOCK(sal_info_type); + + /* Get the process state information */ + log_buffer = IA64_LOG_NEXT_BUFFER(sal_info_type); + + total_len = ia64_sal_get_state_info(sal_info_type, (u64 *)log_buffer); + + if (total_len) { + IA64_LOG_INDEX_INC(sal_info_type); + IA64_LOG_UNLOCK(sal_info_type); + if (irq_safe) { + IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. " + "Record length = %ld\n", __FUNCTION__, sal_info_type, total_len); + } + *buffer = (u8 *) log_buffer; + return total_len; + } else { + IA64_LOG_UNLOCK(sal_info_type); + return 0; + } +} + +/* + * ia64_mca_log_sal_error_record + * + * This function retrieves a specified error record type from SAL + * and wakes up any processes waiting for error records. + * + * Inputs : sal_info_type (Type of error record MCA/CMC/CPE/INIT) + */ +static void +ia64_mca_log_sal_error_record(int sal_info_type) +{ + u8 *buffer; + sal_log_record_header_t *rh; + u64 size; + int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA && sal_info_type != SAL_INFO_TYPE_INIT; +#ifdef IA64_MCA_DEBUG_INFO + static const char * const rec_name[] = { "MCA", "INIT", "CMC", "CPE" }; +#endif + + size = ia64_log_get(sal_info_type, &buffer, irq_safe); + if (!size) + return; + + salinfo_log_wakeup(sal_info_type, buffer, size, irq_safe); + + if (irq_safe) + IA64_MCA_DEBUG("CPU %d: SAL log contains %s error record\n", + smp_processor_id(), + sal_info_type < ARRAY_SIZE(rec_name) ? 
rec_name[sal_info_type] : "UNKNOWN"); + + /* Clear logs from corrected errors in case there's no user-level logger */ + rh = (sal_log_record_header_t *)buffer; + if (rh->severity == sal_log_severity_corrected) + ia64_sal_clear_state_info(sal_info_type); +} + +/* + * platform dependent error handling + */ +#ifndef PLATFORM_MCA_HANDLERS + +#ifdef CONFIG_ACPI + +static int cpe_vector = -1; + +static irqreturn_t +ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) +{ + static unsigned long cpe_history[CPE_HISTORY_LENGTH]; + static int index; + static DEFINE_SPINLOCK(cpe_history_lock); + + IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", + __FUNCTION__, cpe_irq, smp_processor_id()); + + /* SAL spec states this should run w/ interrupts enabled */ + local_irq_enable(); + + /* Get the CPE error record and log it */ + ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE); + + spin_lock(&cpe_history_lock); + if (!cpe_poll_enabled && cpe_vector >= 0) { + + int i, count = 1; /* we know 1 happened now */ + unsigned long now = jiffies; + + for (i = 0; i < CPE_HISTORY_LENGTH; i++) { + if (now - cpe_history[i] <= HZ) + count++; + } + + IA64_MCA_DEBUG(KERN_INFO "CPE threshold %d/%d\n", count, CPE_HISTORY_LENGTH); + if (count >= CPE_HISTORY_LENGTH) { + + cpe_poll_enabled = 1; + spin_unlock(&cpe_history_lock); + disable_irq_nosync(local_vector_to_irq(IA64_CPE_VECTOR)); + + /* + * Corrected errors will still be corrected, but + * make sure there's a log somewhere that indicates + * something is generating more than we can handle. + */ + printk(KERN_WARNING "WARNING: Switching to polling CPE handler; error records may be lost\n"); + + mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL); + + /* lock already released, get out now */ + return IRQ_HANDLED; + } else { + cpe_history[index++] = now; + if (index == CPE_HISTORY_LENGTH) + index = 0; + } + } + spin_unlock(&cpe_history_lock); + return IRQ_HANDLED; +} + +#endif /* CONFIG_ACPI */ + +static void +show_min_state (pal_min_state_area_t *minstate) +{ + u64 iip = minstate->pmsa_iip + ((struct ia64_psr *)(&minstate->pmsa_ipsr))->ri; + u64 xip = minstate->pmsa_xip + ((struct ia64_psr *)(&minstate->pmsa_xpsr))->ri; + + printk("NaT bits\t%016lx\n", minstate->pmsa_nat_bits); + printk("pr\t\t%016lx\n", minstate->pmsa_pr); + printk("b0\t\t%016lx ", minstate->pmsa_br0); print_symbol("%s\n", minstate->pmsa_br0); + printk("ar.rsc\t\t%016lx\n", minstate->pmsa_rsc); + printk("cr.iip\t\t%016lx ", iip); print_symbol("%s\n", iip); + printk("cr.ipsr\t\t%016lx\n", minstate->pmsa_ipsr); + printk("cr.ifs\t\t%016lx\n", minstate->pmsa_ifs); + printk("xip\t\t%016lx ", xip); print_symbol("%s\n", xip); + printk("xpsr\t\t%016lx\n", minstate->pmsa_xpsr); + printk("xfs\t\t%016lx\n", minstate->pmsa_xfs); + printk("b1\t\t%016lx ", minstate->pmsa_br1); + print_symbol("%s\n", minstate->pmsa_br1); + + printk("\nstatic registers r0-r15:\n"); + printk(" r0- 3 %016lx %016lx %016lx %016lx\n", + 0UL, minstate->pmsa_gr[0], minstate->pmsa_gr[1], minstate->pmsa_gr[2]); + printk(" r4- 7 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_gr[3], minstate->pmsa_gr[4], + minstate->pmsa_gr[5], minstate->pmsa_gr[6]); + printk(" r8-11 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_gr[7], minstate->pmsa_gr[8], + minstate->pmsa_gr[9], minstate->pmsa_gr[10]); + printk("r12-15 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_gr[11], minstate->pmsa_gr[12], + minstate->pmsa_gr[13], minstate->pmsa_gr[14]); + + printk("\nbank 0:\n"); + printk("r16-19 %016lx %016lx %016lx 
%016lx\n", + minstate->pmsa_bank0_gr[0], minstate->pmsa_bank0_gr[1], + minstate->pmsa_bank0_gr[2], minstate->pmsa_bank0_gr[3]); + printk("r20-23 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_bank0_gr[4], minstate->pmsa_bank0_gr[5], + minstate->pmsa_bank0_gr[6], minstate->pmsa_bank0_gr[7]); + printk("r24-27 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_bank0_gr[8], minstate->pmsa_bank0_gr[9], + minstate->pmsa_bank0_gr[10], minstate->pmsa_bank0_gr[11]); + printk("r28-31 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_bank0_gr[12], minstate->pmsa_bank0_gr[13], + minstate->pmsa_bank0_gr[14], minstate->pmsa_bank0_gr[15]); + + printk("\nbank 1:\n"); + printk("r16-19 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_bank1_gr[0], minstate->pmsa_bank1_gr[1], + minstate->pmsa_bank1_gr[2], minstate->pmsa_bank1_gr[3]); + printk("r20-23 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_bank1_gr[4], minstate->pmsa_bank1_gr[5], + minstate->pmsa_bank1_gr[6], minstate->pmsa_bank1_gr[7]); + printk("r24-27 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_bank1_gr[8], minstate->pmsa_bank1_gr[9], + minstate->pmsa_bank1_gr[10], minstate->pmsa_bank1_gr[11]); + printk("r28-31 %016lx %016lx %016lx %016lx\n", + minstate->pmsa_bank1_gr[12], minstate->pmsa_bank1_gr[13], + minstate->pmsa_bank1_gr[14], minstate->pmsa_bank1_gr[15]); +} + +static void +fetch_min_state (pal_min_state_area_t *ms, struct pt_regs *pt, struct switch_stack *sw) +{ + u64 *dst_banked, *src_banked, bit, shift, nat_bits; + int i; + + /* + * First, update the pt-regs and switch-stack structures with the contents stored + * in the min-state area: + */ + if (((struct ia64_psr *) &ms->pmsa_ipsr)->ic == 0) { + pt->cr_ipsr = ms->pmsa_xpsr; + pt->cr_iip = ms->pmsa_xip; + pt->cr_ifs = ms->pmsa_xfs; + } else { + pt->cr_ipsr = ms->pmsa_ipsr; + pt->cr_iip = ms->pmsa_iip; + pt->cr_ifs = ms->pmsa_ifs; + } + pt->ar_rsc = ms->pmsa_rsc; + pt->pr = ms->pmsa_pr; + pt->r1 = ms->pmsa_gr[0]; + pt->r2 = ms->pmsa_gr[1]; + pt->r3 = ms->pmsa_gr[2]; + sw->r4 = ms->pmsa_gr[3]; + sw->r5 = ms->pmsa_gr[4]; + sw->r6 = ms->pmsa_gr[5]; + sw->r7 = ms->pmsa_gr[6]; + pt->r8 = ms->pmsa_gr[7]; + pt->r9 = ms->pmsa_gr[8]; + pt->r10 = ms->pmsa_gr[9]; + pt->r11 = ms->pmsa_gr[10]; + pt->r12 = ms->pmsa_gr[11]; + pt->r13 = ms->pmsa_gr[12]; + pt->r14 = ms->pmsa_gr[13]; + pt->r15 = ms->pmsa_gr[14]; + dst_banked = &pt->r16; /* r16-r31 are contiguous in struct pt_regs */ + src_banked = ms->pmsa_bank1_gr; + for (i = 0; i < 16; ++i) + dst_banked[i] = src_banked[i]; + pt->b0 = ms->pmsa_br0; + sw->b1 = ms->pmsa_br1; + + /* construct the NaT bits for the pt-regs structure: */ +# define PUT_NAT_BIT(dst, addr) \ + do { \ + bit = nat_bits & 1; nat_bits >>= 1; \ + shift = ((unsigned long) addr >> 3) & 0x3f; \ + dst = ((dst) & ~(1UL << shift)) | (bit << shift); \ + } while (0) + + /* Rotate the saved NaT bits such that bit 0 corresponds to pmsa_gr[0]: */ + shift = ((unsigned long) &ms->pmsa_gr[0] >> 3) & 0x3f; + nat_bits = (ms->pmsa_nat_bits >> shift) | (ms->pmsa_nat_bits << (64 - shift)); + + PUT_NAT_BIT(sw->caller_unat, &pt->r1); + PUT_NAT_BIT(sw->caller_unat, &pt->r2); + PUT_NAT_BIT(sw->caller_unat, &pt->r3); + PUT_NAT_BIT(sw->ar_unat, &sw->r4); + PUT_NAT_BIT(sw->ar_unat, &sw->r5); + PUT_NAT_BIT(sw->ar_unat, &sw->r6); + PUT_NAT_BIT(sw->ar_unat, &sw->r7); + PUT_NAT_BIT(sw->caller_unat, &pt->r8); PUT_NAT_BIT(sw->caller_unat, &pt->r9); + PUT_NAT_BIT(sw->caller_unat, &pt->r10); PUT_NAT_BIT(sw->caller_unat, &pt->r11); + PUT_NAT_BIT(sw->caller_unat, &pt->r12); PUT_NAT_BIT(sw->caller_unat, &pt->r13); + 
PUT_NAT_BIT(sw->caller_unat, &pt->r14); PUT_NAT_BIT(sw->caller_unat, &pt->r15); + nat_bits >>= 16; /* skip over bank0 NaT bits */ + PUT_NAT_BIT(sw->caller_unat, &pt->r16); PUT_NAT_BIT(sw->caller_unat, &pt->r17); + PUT_NAT_BIT(sw->caller_unat, &pt->r18); PUT_NAT_BIT(sw->caller_unat, &pt->r19); + PUT_NAT_BIT(sw->caller_unat, &pt->r20); PUT_NAT_BIT(sw->caller_unat, &pt->r21); + PUT_NAT_BIT(sw->caller_unat, &pt->r22); PUT_NAT_BIT(sw->caller_unat, &pt->r23); + PUT_NAT_BIT(sw->caller_unat, &pt->r24); PUT_NAT_BIT(sw->caller_unat, &pt->r25); + PUT_NAT_BIT(sw->caller_unat, &pt->r26); PUT_NAT_BIT(sw->caller_unat, &pt->r27); + PUT_NAT_BIT(sw->caller_unat, &pt->r28); PUT_NAT_BIT(sw->caller_unat, &pt->r29); + PUT_NAT_BIT(sw->caller_unat, &pt->r30); PUT_NAT_BIT(sw->caller_unat, &pt->r31); +} + +static void +init_handler_platform (pal_min_state_area_t *ms, + struct pt_regs *pt, struct switch_stack *sw) +{ + struct unw_frame_info info; + + /* if a kernel debugger is available call it here else just dump the registers */ + + /* + * Wait for a bit. On some machines (e.g., HP's zx2000 and zx6000, INIT can be + * generated via the BMC's command-line interface, but since the console is on the + * same serial line, the user will need some time to switch out of the BMC before + * the dump begins. + */ + printk("Delaying for 5 seconds...\n"); + udelay(5*1000000); + show_min_state(ms); + + printk("Backtrace of current task (pid %d, %s)\n", current->pid, current->comm); + fetch_min_state(ms, pt, sw); + unw_init_from_interruption(&info, current, pt, sw); + ia64_do_show_stack(&info, NULL); + +#ifdef CONFIG_SMP + /* read_trylock() would be handy... */ + if (!tasklist_lock.write_lock) + read_lock(&tasklist_lock); +#endif + { + struct task_struct *g, *t; + do_each_thread (g, t) { + if (t == current) + continue; + + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); + } while_each_thread (g, t); + } +#ifdef CONFIG_SMP + if (!tasklist_lock.write_lock) + read_unlock(&tasklist_lock); +#endif + + printk("\nINIT dump complete. Please reboot now.\n"); + while (1); /* hang city if no debugger */ +} + +#ifdef CONFIG_ACPI +/* + * ia64_mca_register_cpev + * + * Register the corrected platform error vector with SAL. + * + * Inputs + * cpev Corrected Platform Error Vector number + * + * Outputs + * None + */ +static void +ia64_mca_register_cpev (int cpev) +{ + /* Register the CPE interrupt vector with SAL */ + struct ia64_sal_retval isrv; + + isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0); + if (isrv.status) { + printk(KERN_ERR "Failed to register Corrected Platform " + "Error interrupt vector with SAL (status %ld)\n", isrv.status); + return; + } + + IA64_MCA_DEBUG("%s: corrected platform error " + "vector %#x registered\n", __FUNCTION__, cpev); +} +#endif /* CONFIG_ACPI */ + +#endif /* PLATFORM_MCA_HANDLERS */ + +/* + * ia64_mca_cmc_vector_setup + * + * Setup the corrected machine check vector register in the processor. + * (The interrupt is masked on boot. ia64_mca_late_init unmask this.) + * This function is invoked on a per-processor basis. 
+ * + * Inputs + * None + * + * Outputs + * None + */ +void +ia64_mca_cmc_vector_setup (void) +{ + cmcv_reg_t cmcv; + + cmcv.cmcv_regval = 0; + cmcv.cmcv_mask = 1; /* Mask/disable interrupt at first */ + cmcv.cmcv_vector = IA64_CMC_VECTOR; + ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); + + IA64_MCA_DEBUG("%s: CPU %d corrected " + "machine check vector %#x registered.\n", + __FUNCTION__, smp_processor_id(), IA64_CMC_VECTOR); + + IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n", + __FUNCTION__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV)); +} + +/* + * ia64_mca_cmc_vector_disable + * + * Mask the corrected machine check vector register in the processor. + * This function is invoked on a per-processor basis. + * + * Inputs + * dummy(unused) + * + * Outputs + * None + */ +static void +ia64_mca_cmc_vector_disable (void *dummy) +{ + cmcv_reg_t cmcv; + + cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV); + + cmcv.cmcv_mask = 1; /* Mask/disable interrupt */ + ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); + + IA64_MCA_DEBUG("%s: CPU %d corrected " + "machine check vector %#x disabled.\n", + __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector); +} + +/* + * ia64_mca_cmc_vector_enable + * + * Unmask the corrected machine check vector register in the processor. + * This function is invoked on a per-processor basis. + * + * Inputs + * dummy(unused) + * + * Outputs + * None + */ +static void +ia64_mca_cmc_vector_enable (void *dummy) +{ + cmcv_reg_t cmcv; + + cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV); + + cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */ + ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); + + IA64_MCA_DEBUG("%s: CPU %d corrected " + "machine check vector %#x enabled.\n", + __FUNCTION__, smp_processor_id(), cmcv.cmcv_vector); +} + +/* + * ia64_mca_cmc_vector_disable_keventd + * + * Called via keventd (smp_call_function() is not safe in interrupt context) to + * disable the cmc interrupt vector. + */ +static void +ia64_mca_cmc_vector_disable_keventd(void *unused) +{ + on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0); +} + +/* + * ia64_mca_cmc_vector_enable_keventd + * + * Called via keventd (smp_call_function() is not safe in interrupt context) to + * enable the cmc interrupt vector. + */ +static void +ia64_mca_cmc_vector_enable_keventd(void *unused) +{ + on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0); +} + +/* + * ia64_mca_wakeup_ipi_wait + * + * Wait for the inter-cpu interrupt to be sent by the + * monarch processor once it is done with handling the + * MCA. + * + * Inputs : None + * Outputs : None + */ +static void +ia64_mca_wakeup_ipi_wait(void) +{ + int irr_num = (IA64_MCA_WAKEUP_VECTOR >> 6); + int irr_bit = (IA64_MCA_WAKEUP_VECTOR & 0x3f); + u64 irr = 0; + + do { + switch(irr_num) { + case 0: + irr = ia64_getreg(_IA64_REG_CR_IRR0); + break; + case 1: + irr = ia64_getreg(_IA64_REG_CR_IRR1); + break; + case 2: + irr = ia64_getreg(_IA64_REG_CR_IRR2); + break; + case 3: + irr = ia64_getreg(_IA64_REG_CR_IRR3); + break; + } + cpu_relax(); + } while (!(irr & (1UL << irr_bit))) ; +} + +/* + * ia64_mca_wakeup + * + * Send an inter-cpu interrupt to wake-up a particular cpu + * and mark that cpu to be out of rendez. + * + * Inputs : cpuid + * Outputs : None + */ +static void +ia64_mca_wakeup(int cpu) +{ + platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0); + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + +} + +/* + * ia64_mca_wakeup_all + * + * Wakeup all the cpus which have rendez'ed previously. 
+ *
+ *	Inputs  : None
+ *	Outputs : None
+ */
+static void
+ia64_mca_wakeup_all(void)
+{
+	int cpu;
+
+	/* Clear the Rendez checkin flag for all cpus */
+	for(cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_online(cpu))
+			continue;
+		if (ia64_mc_info.imi_rendez_checkin[cpu] == IA64_MCA_RENDEZ_CHECKIN_DONE)
+			ia64_mca_wakeup(cpu);
+	}
+
+}
+
+/*
+ * ia64_mca_rendez_interrupt_handler
+ *
+ *	This is the handler used to put the slave processors into a spinloop
+ *	while the monarch processor does the MCA handling, and to wake each
+ *	slave up once the monarch is done.
+ *
+ *	Inputs  : None
+ *	Outputs : None
+ */
+static irqreturn_t
+ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs)
+{
+	unsigned long flags;
+	int cpu = smp_processor_id();
+
+	/* Mask all interrupts */
+	local_irq_save(flags);
+
+	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE;
+	/* Register with the SAL monarch that the slave has
+	 * reached SAL
+	 */
+	ia64_sal_mc_rendez();
+
+	/* Wait for the wakeup IPI from the monarch.
+	 * This waiting is done by polling on the wakeup-interrupt
+	 * vector bit in the processor's IRRs
+	 */
+	ia64_mca_wakeup_ipi_wait();
+
+	/* Enable all interrupts */
+	local_irq_restore(flags);
+	return IRQ_HANDLED;
+}
+
+/*
+ * ia64_mca_wakeup_int_handler
+ *
+ *	The interrupt handler for processing the inter-cpu interrupt to the
+ *	slave cpu which was spinning in the rendez loop.
+ *	Since this spinning is done by turning off the interrupts and
+ *	polling on the wakeup-interrupt bit in the IRR, there is
+ *	nothing useful to be done in the handler.
+ *
+ *	Inputs  : wakeup_irq	(Wakeup-interrupt bit)
+ *	arg			(Interrupt handler specific argument)
+ *	ptregs			(Exception frame at the time of the interrupt)
+ *	Outputs : None
+ *
+ */
+static irqreturn_t
+ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg, struct pt_regs *ptregs)
+{
+	return IRQ_HANDLED;
+}
+
+/*
+ * ia64_return_to_sal_check
+ *
+ *	This is the function called before going back from the OS_MCA handler
+ *	to the OS_MCA dispatch code, which finally takes control back
+ *	to SAL.
+ *	The main purpose of this routine is to set up the OS_MCA to SAL
+ *	return state, which can be used by the OS_MCA dispatch code
+ *	just before going back to SAL.
+ *
+ *	Inputs  : None
+ *	Outputs : None
+ */
+
+static void
+ia64_return_to_sal_check(int recover)
+{
+
+	/* Copy over some relevant state from the sal_to_os_mca_handoff
+	 * so that it can be used at the time of os_mca_to_sal_handoff
+	 */
+	ia64_os_to_sal_handoff_state.imots_sal_gp =
+		ia64_sal_to_os_handoff_state.imsto_sal_gp;
+
+	ia64_os_to_sal_handoff_state.imots_sal_check_ra =
+		ia64_sal_to_os_handoff_state.imsto_sal_check_ra;
+
+	if (recover)
+		ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_CORRECTED;
+	else
+		ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_COLD_BOOT;
+
+	/* Default = tell SAL to return to same context */
+	ia64_os_to_sal_handoff_state.imots_context = IA64_MCA_SAME_CONTEXT;
+
+	ia64_os_to_sal_handoff_state.imots_new_min_state =
+		(u64 *)ia64_sal_to_os_handoff_state.pal_min_state;
+
+}
+
+/* Function pointer for extra MCA recovery */
+int (*ia64_mca_ucmc_extension)
+	(void*,ia64_mca_sal_to_os_state_t*,ia64_mca_os_to_sal_state_t*)
+	= NULL;
+
+int
+ia64_reg_MCA_extension(void *fn)
+{
+	if (ia64_mca_ucmc_extension)
+		return 1;
+
+	ia64_mca_ucmc_extension = fn;
+	return 0;
+}
+
+void
+ia64_unreg_MCA_extension(void)
+{
+	if (ia64_mca_ucmc_extension)
+		ia64_mca_ucmc_extension = NULL;
+}
+
+EXPORT_SYMBOL(ia64_reg_MCA_extension);
+EXPORT_SYMBOL(ia64_unreg_MCA_extension);
+
+/*
+ * ia64_mca_ucmc_handler
+ *
+ *	This is the uncorrectable machine check handler, called from the
+ *	OS_MCA dispatch code which is in turn called from SAL_CHECK().
+ *	This is the place where the core of OS MCA handling is done.
+ *	Right now the logs are extracted and displayed in a well-defined
+ *	format.  This handler code is supposed to be run only on the
+ *	monarch processor.  Once the monarch is done with MCA handling,
+ *	further MCA logging is enabled by clearing the logs.
+ *	The monarch also has the duty of sending wakeup-IPIs to pull the
+ *	slave processors out of the rendezvous spinloop.
+ *
+ *	Inputs  : None
+ *	Outputs : None
+ */
+void
+ia64_mca_ucmc_handler(void)
+{
+	pal_processor_state_info_t *psp = (pal_processor_state_info_t *)
+		&ia64_sal_to_os_handoff_state.proc_state_param;
+	int recover;
+
+	/* Get the MCA error record and log it */
+	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
+
+	/* A TLB error is the only error in this SAL record that the OS can
+	 * recover from by itself; anything else needs the extension hook. */
+	recover = (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc))
+		/* other error recovery */
+		|| (ia64_mca_ucmc_extension
+			&& ia64_mca_ucmc_extension(
+				IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA),
+				&ia64_sal_to_os_handoff_state,
+				&ia64_os_to_sal_handoff_state));
+
+	if (recover) {
+		sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA);
+		rh->severity = sal_log_severity_corrected;
+		ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
+	}
+	/*
+	 *  Wake up all the processors which are spinning in the rendezvous
+	 *  loop.
+	 */
+	ia64_mca_wakeup_all();
+
+	/* Return to SAL */
+	ia64_return_to_sal_check(recover);
+}
+
+static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd, NULL);
+static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd, NULL);
+
+/*
+ * ia64_mca_cmc_int_handler
+ *
+ *	This is the corrected machine check interrupt handler.
+ *	Right now the logs are extracted and displayed in a well-defined
+ *	format.
+ * + * Inputs + * interrupt number + * client data arg ptr + * saved registers ptr + * + * Outputs + * None + */ +static irqreturn_t +ia64_mca_cmc_int_handler(int cmc_irq, void *arg, struct pt_regs *ptregs) +{ + static unsigned long cmc_history[CMC_HISTORY_LENGTH]; + static int index; + static DEFINE_SPINLOCK(cmc_history_lock); + + IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", + __FUNCTION__, cmc_irq, smp_processor_id()); + + /* SAL spec states this should run w/ interrupts enabled */ + local_irq_enable(); + + /* Get the CMC error record and log it */ + ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC); + + spin_lock(&cmc_history_lock); + if (!cmc_polling_enabled) { + int i, count = 1; /* we know 1 happened now */ + unsigned long now = jiffies; + + for (i = 0; i < CMC_HISTORY_LENGTH; i++) { + if (now - cmc_history[i] <= HZ) + count++; + } + + IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH); + if (count >= CMC_HISTORY_LENGTH) { + + cmc_polling_enabled = 1; + spin_unlock(&cmc_history_lock); + schedule_work(&cmc_disable_work); + + /* + * Corrected errors will still be corrected, but + * make sure there's a log somewhere that indicates + * something is generating more than we can handle. + */ + printk(KERN_WARNING "WARNING: Switching to polling CMC handler; error records may be lost\n"); + + mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL); + + /* lock already released, get out now */ + return IRQ_HANDLED; + } else { + cmc_history[index++] = now; + if (index == CMC_HISTORY_LENGTH) + index = 0; + } + } + spin_unlock(&cmc_history_lock); + return IRQ_HANDLED; +} + +/* + * ia64_mca_cmc_int_caller + * + * Triggered by sw interrupt from CMC polling routine. Calls + * real interrupt handler and either triggers a sw interrupt + * on the next cpu or does cleanup at the end. + * + * Inputs + * interrupt number + * client data arg ptr + * saved registers ptr + * Outputs + * handled + */ +static irqreturn_t +ia64_mca_cmc_int_caller(int cmc_irq, void *arg, struct pt_regs *ptregs) +{ + static int start_count = -1; + unsigned int cpuid; + + cpuid = smp_processor_id(); + + /* If first cpu, update count */ + if (start_count == -1) + start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC); + + ia64_mca_cmc_int_handler(cmc_irq, arg, ptregs); + + for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); + + if (cpuid < NR_CPUS) { + platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); + } else { + /* If no log record, switch out of polling mode */ + if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) { + + printk(KERN_WARNING "Returning to interrupt driven CMC handler\n"); + schedule_work(&cmc_enable_work); + cmc_polling_enabled = 0; + + } else { + + mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL); + } + + start_count = -1; + } + + return IRQ_HANDLED; +} + +/* + * ia64_mca_cmc_poll + * + * Poll for Corrected Machine Checks (CMCs) + * + * Inputs : dummy(unused) + * Outputs : None + * + */ +static void +ia64_mca_cmc_poll (unsigned long dummy) +{ + /* Trigger a CMC interrupt cascade */ + platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); +} + +/* + * ia64_mca_cpe_int_caller + * + * Triggered by sw interrupt from CPE polling routine. Calls + * real interrupt handler and either triggers a sw interrupt + * on the next cpu or does cleanup at the end. 
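+ *
+ *	(Added sketch of the cascade, condensed from the code below:
+ *		cpe_poll_timer -> ia64_mca_cpe_poll()
+ *		    -> platform_send_ipi(first online cpu, IA64_CPEP_VECTOR)
+ *		each cpu: ia64_mca_cpe_int_handler(), then IPI the next
+ *		    online cpu
+ *		last cpu: halve poll_time if a new record was logged, back
+ *		    off otherwise, or re-enable the CPEI irq and leave
+ *		    polling mode; re-arm cpe_poll_timer while still polling.)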
+ * + * Inputs + * interrupt number + * client data arg ptr + * saved registers ptr + * Outputs + * handled + */ +#ifdef CONFIG_ACPI + +static irqreturn_t +ia64_mca_cpe_int_caller(int cpe_irq, void *arg, struct pt_regs *ptregs) +{ + static int start_count = -1; + static int poll_time = MIN_CPE_POLL_INTERVAL; + unsigned int cpuid; + + cpuid = smp_processor_id(); + + /* If first cpu, update count */ + if (start_count == -1) + start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE); + + ia64_mca_cpe_int_handler(cpe_irq, arg, ptregs); + + for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++); + + if (cpuid < NR_CPUS) { + platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); + } else { + /* + * If a log was recorded, increase our polling frequency, + * otherwise, backoff or return to interrupt mode. + */ + if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) { + poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time / 2); + } else if (cpe_vector < 0) { + poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2); + } else { + poll_time = MIN_CPE_POLL_INTERVAL; + + printk(KERN_WARNING "Returning to interrupt driven CPE handler\n"); + enable_irq(local_vector_to_irq(IA64_CPE_VECTOR)); + cpe_poll_enabled = 0; + } + + if (cpe_poll_enabled) + mod_timer(&cpe_poll_timer, jiffies + poll_time); + start_count = -1; + } + + return IRQ_HANDLED; +} + +#endif /* CONFIG_ACPI */ + +/* + * ia64_mca_cpe_poll + * + * Poll for Corrected Platform Errors (CPEs), trigger interrupt + * on first cpu, from there it will trickle through all the cpus. + * + * Inputs : dummy(unused) + * Outputs : None + * + */ +static void +ia64_mca_cpe_poll (unsigned long dummy) +{ + /* Trigger a CPE interrupt cascade */ + platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); +} + +/* + * C portion of the OS INIT handler + * + * Called from ia64_monarch_init_handler + * + * Inputs: pointer to pt_regs where processor info was saved. + * + * Returns: + * 0 if SAL must warm boot the System + * 1 if SAL must return to interrupted context using PAL_MC_RESUME + * + */ +void +ia64_init_handler (struct pt_regs *pt, struct switch_stack *sw) +{ + pal_min_state_area_t *ms; + + oops_in_progress = 1; /* avoid deadlock in printk, but it makes recovery dodgy */ + console_loglevel = 15; /* make sure printks make it to console */ + + printk(KERN_INFO "Entered OS INIT handler. PSP=%lx\n", + ia64_sal_to_os_handoff_state.proc_state_param); + + /* + * Address of minstate area provided by PAL is physical, + * uncacheable (bit 63 set). Convert to Linux virtual + * address in region 6. 
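+	 * (Added note: region 6 is the kernel's uncached identity
+	 * mapping, so the "| (6ul<<61)" below yields a virtual address
+	 * that preserves the uncacheable attribute of the PAL-provided
+	 * physical address.)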
+ */ + ms = (pal_min_state_area_t *)(ia64_sal_to_os_handoff_state.pal_min_state | (6ul<<61)); + + init_handler_platform(ms, pt, sw); /* call platform specific routines */ +} + +static int __init +ia64_mca_disable_cpe_polling(char *str) +{ + cpe_poll_enabled = 0; + return 1; +} + +__setup("disable_cpe_poll", ia64_mca_disable_cpe_polling); + +static struct irqaction cmci_irqaction = { + .handler = ia64_mca_cmc_int_handler, + .flags = SA_INTERRUPT, + .name = "cmc_hndlr" +}; + +static struct irqaction cmcp_irqaction = { + .handler = ia64_mca_cmc_int_caller, + .flags = SA_INTERRUPT, + .name = "cmc_poll" +}; + +static struct irqaction mca_rdzv_irqaction = { + .handler = ia64_mca_rendez_int_handler, + .flags = SA_INTERRUPT, + .name = "mca_rdzv" +}; + +static struct irqaction mca_wkup_irqaction = { + .handler = ia64_mca_wakeup_int_handler, + .flags = SA_INTERRUPT, + .name = "mca_wkup" +}; + +#ifdef CONFIG_ACPI +static struct irqaction mca_cpe_irqaction = { + .handler = ia64_mca_cpe_int_handler, + .flags = SA_INTERRUPT, + .name = "cpe_hndlr" +}; + +static struct irqaction mca_cpep_irqaction = { + .handler = ia64_mca_cpe_int_caller, + .flags = SA_INTERRUPT, + .name = "cpe_poll" +}; +#endif /* CONFIG_ACPI */ + +/* Do per-CPU MCA-related initialization. */ + +void __devinit +ia64_mca_cpu_init(void *cpu_data) +{ + void *pal_vaddr; + + if (smp_processor_id() == 0) { + void *mca_data; + int cpu; + + mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) + * NR_CPUS); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + __per_cpu_mca[cpu] = __pa(mca_data); + mca_data += sizeof(struct ia64_mca_cpu); + } + } + + /* + * The MCA info structure was allocated earlier and its + * physical address saved in __per_cpu_mca[cpu]. Copy that + * address * to ia64_mca_data so we can access it as a per-CPU + * variable. + */ + __get_cpu_var(ia64_mca_data) = __per_cpu_mca[smp_processor_id()]; + + /* + * Stash away a copy of the PTE needed to map the per-CPU page. + * We may need it during MCA recovery. + */ + __get_cpu_var(ia64_mca_per_cpu_pte) = + pte_val(mk_pte_phys(__pa(cpu_data), PAGE_KERNEL)); + + /* + * Also, stash away a copy of the PAL address and the PTE + * needed to map it. + */ + pal_vaddr = efi_get_pal_addr(); + if (!pal_vaddr) + return; + __get_cpu_var(ia64_mca_pal_base) = + GRANULEROUNDDOWN((unsigned long) pal_vaddr); + __get_cpu_var(ia64_mca_pal_pte) = pte_val(mk_pte_phys(__pa(pal_vaddr), + PAGE_KERNEL)); +} + +/* + * ia64_mca_init + * + * Do all the system level mca specific initialization. + * + * 1. Register spinloop and wakeup request interrupt vectors + * + * 2. Register OS_MCA handler entry point + * + * 3. Register OS_INIT handler entry point + * + * 4. Initialize MCA/CMC/INIT related log buffers maintained by the OS. + * + * Note that this initialization is done very early before some kernel + * services are available. 
+ * + * Inputs : None + * + * Outputs : None + */ +void __init +ia64_mca_init(void) +{ + ia64_fptr_t *mon_init_ptr = (ia64_fptr_t *)ia64_monarch_init_handler; + ia64_fptr_t *slave_init_ptr = (ia64_fptr_t *)ia64_slave_init_handler; + ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch; + int i; + s64 rc; + struct ia64_sal_retval isrv; + u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ + + IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__); + + /* Clear the Rendez checkin flag for all cpus */ + for(i = 0 ; i < NR_CPUS; i++) + ia64_mc_info.imi_rendez_checkin[i] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + + /* + * Register the rendezvous spinloop and wakeup mechanism with SAL + */ + + /* Register the rendezvous interrupt vector with SAL */ + while (1) { + isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT, + SAL_MC_PARAM_MECHANISM_INT, + IA64_MCA_RENDEZ_VECTOR, + timeout, + SAL_MC_PARAM_RZ_ALWAYS); + rc = isrv.status; + if (rc == 0) + break; + if (rc == -2) { + printk(KERN_INFO "Increasing MCA rendezvous timeout from " + "%ld to %ld milliseconds\n", timeout, isrv.v0); + timeout = isrv.v0; + continue; + } + printk(KERN_ERR "Failed to register rendezvous interrupt " + "with SAL (status %ld)\n", rc); + return; + } + + /* Register the wakeup interrupt vector with SAL */ + isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP, + SAL_MC_PARAM_MECHANISM_INT, + IA64_MCA_WAKEUP_VECTOR, + 0, 0); + rc = isrv.status; + if (rc) { + printk(KERN_ERR "Failed to register wakeup interrupt with SAL " + "(status %ld)\n", rc); + return; + } + + IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __FUNCTION__); + + ia64_mc_info.imi_mca_handler = ia64_tpa(mca_hldlr_ptr->fp); + /* + * XXX - disable SAL checksum by setting size to 0; should be + * ia64_tpa(ia64_os_mca_dispatch_end) - ia64_tpa(ia64_os_mca_dispatch); + */ + ia64_mc_info.imi_mca_handler_size = 0; + + /* Register the os mca handler with SAL */ + if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, + ia64_mc_info.imi_mca_handler, + ia64_tpa(mca_hldlr_ptr->gp), + ia64_mc_info.imi_mca_handler_size, + 0, 0, 0))) + { + printk(KERN_ERR "Failed to register OS MCA handler with SAL " + "(status %ld)\n", rc); + return; + } + + IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __FUNCTION__, + ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp)); + + /* + * XXX - disable SAL checksum by setting size to 0, should be + * size of the actual init handler in mca_asm.S. + */ + ia64_mc_info.imi_monarch_init_handler = ia64_tpa(mon_init_ptr->fp); + ia64_mc_info.imi_monarch_init_handler_size = 0; + ia64_mc_info.imi_slave_init_handler = ia64_tpa(slave_init_ptr->fp); + ia64_mc_info.imi_slave_init_handler_size = 0; + + IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __FUNCTION__, + ia64_mc_info.imi_monarch_init_handler); + + /* Register the os init handler with SAL */ + if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, + ia64_mc_info.imi_monarch_init_handler, + ia64_tpa(ia64_getreg(_IA64_REG_GP)), + ia64_mc_info.imi_monarch_init_handler_size, + ia64_mc_info.imi_slave_init_handler, + ia64_tpa(ia64_getreg(_IA64_REG_GP)), + ia64_mc_info.imi_slave_init_handler_size))) + { + printk(KERN_ERR "Failed to register m/s INIT handlers with SAL " + "(status %ld)\n", rc); + return; + } + + IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__); + + /* + * Configure the CMCI/P vector and handler. Interrupts for CMC are + * per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c). 
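+	 * (Added note: each error type pairs a hardware vector with a "P"
+	 * IPI vector -- e.g. IA64_CMC_VECTOR/IA64_CMCP_VECTOR -- the latter
+	 * being what ia64_mca_cmc_poll uses to cascade the poll across all
+	 * online CPUs.)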
+ */ + register_percpu_irq(IA64_CMC_VECTOR, &cmci_irqaction); + register_percpu_irq(IA64_CMCP_VECTOR, &cmcp_irqaction); + ia64_mca_cmc_vector_setup(); /* Setup vector on BSP */ + + /* Setup the MCA rendezvous interrupt vector */ + register_percpu_irq(IA64_MCA_RENDEZ_VECTOR, &mca_rdzv_irqaction); + + /* Setup the MCA wakeup interrupt vector */ + register_percpu_irq(IA64_MCA_WAKEUP_VECTOR, &mca_wkup_irqaction); + +#ifdef CONFIG_ACPI + /* Setup the CPEI/P vector and handler */ + cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI); + register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction); +#endif + + /* Initialize the areas set aside by the OS to buffer the + * platform/processor error states for MCA/INIT/CMC + * handling. + */ + ia64_log_init(SAL_INFO_TYPE_MCA); + ia64_log_init(SAL_INFO_TYPE_INIT); + ia64_log_init(SAL_INFO_TYPE_CMC); + ia64_log_init(SAL_INFO_TYPE_CPE); + + mca_init = 1; + printk(KERN_INFO "MCA related initialization done\n"); +} + +/* + * ia64_mca_late_init + * + * Opportunity to setup things that require initialization later + * than ia64_mca_init. Setup a timer to poll for CPEs if the + * platform doesn't support an interrupt driven mechanism. + * + * Inputs : None + * Outputs : Status + */ +static int __init +ia64_mca_late_init(void) +{ + if (!mca_init) + return 0; + + /* Setup the CMCI/P vector and handler */ + init_timer(&cmc_poll_timer); + cmc_poll_timer.function = ia64_mca_cmc_poll; + + /* Unmask/enable the vector */ + cmc_polling_enabled = 0; + schedule_work(&cmc_enable_work); + + IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __FUNCTION__); + +#ifdef CONFIG_ACPI + /* Setup the CPEI/P vector and handler */ + init_timer(&cpe_poll_timer); + cpe_poll_timer.function = ia64_mca_cpe_poll; + + { + irq_desc_t *desc; + unsigned int irq; + + if (cpe_vector >= 0) { + /* If platform supports CPEI, enable the irq. */ + cpe_poll_enabled = 0; + for (irq = 0; irq < NR_IRQS; ++irq) + if (irq_to_vector(irq) == cpe_vector) { + desc = irq_descp(irq); + desc->status |= IRQ_PER_CPU; + setup_irq(irq, &mca_cpe_irqaction); + } + ia64_mca_register_cpev(cpe_vector); + IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__); + } else { + /* If platform doesn't support CPEI, get the timer going. */ + if (cpe_poll_enabled) { + ia64_mca_cpe_poll(0UL); + IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __FUNCTION__); + } + } + } +#endif + + return 0; +} + +device_initcall(ia64_mca_late_init); diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S new file mode 100644 index 000000000000..cf3f8014f9ad --- /dev/null +++ b/arch/ia64/kernel/mca_asm.S @@ -0,0 +1,928 @@ +// +// assembly portion of the IA64 MCA handling +// +// Mods by cfleck to integrate into kernel build +// 00/03/15 davidm Added various stop bits to get a clean compile +// +// 00/03/29 cfleck Added code to save INIT handoff state in pt_regs format, switch to temp +// kstack, switch modes, jump to C INIT handler +// +// 02/01/04 J.Hall <jenna.s.hall@intel.com> +// Before entering virtual mode code: +// 1. Check for TLB CPU error +// 2. Restore current thread pointer to kr6 +// 3. Move stack ptr 16 bytes to conform to C calling convention +// +// 04/11/12 Russ Anderson <rja@sgi.com> +// Added per cpu MCA/INIT stack save areas. 
+// +#include <linux/config.h> +#include <linux/threads.h> + +#include <asm/asmmacro.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/mca_asm.h> +#include <asm/mca.h> + +/* + * When we get a machine check, the kernel stack pointer is no longer + * valid, so we need to set a new stack pointer. + */ +#define MINSTATE_PHYS /* Make sure stack access is physical for MINSTATE */ + +/* + * Needed for return context to SAL + */ +#define IA64_MCA_SAME_CONTEXT 0 +#define IA64_MCA_COLD_BOOT -2 + +#include "minstate.h" + +/* + * SAL_TO_OS_MCA_HANDOFF_STATE (SAL 3.0 spec) + * 1. GR1 = OS GP + * 2. GR8 = PAL_PROC physical address + * 3. GR9 = SAL_PROC physical address + * 4. GR10 = SAL GP (physical) + * 5. GR11 = Rendez state + * 6. GR12 = Return address to location within SAL_CHECK + */ +#define SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(_tmp) \ + LOAD_PHYSICAL(p0, _tmp, ia64_sal_to_os_handoff_state);; \ + st8 [_tmp]=r1,0x08;; \ + st8 [_tmp]=r8,0x08;; \ + st8 [_tmp]=r9,0x08;; \ + st8 [_tmp]=r10,0x08;; \ + st8 [_tmp]=r11,0x08;; \ + st8 [_tmp]=r12,0x08;; \ + st8 [_tmp]=r17,0x08;; \ + st8 [_tmp]=r18,0x08 + +/* + * OS_MCA_TO_SAL_HANDOFF_STATE (SAL 3.0 spec) + * (p6) is executed if we never entered virtual mode (TLB error) + * (p7) is executed if we entered virtual mode as expected (normal case) + * 1. GR8 = OS_MCA return status + * 2. GR9 = SAL GP (physical) + * 3. GR10 = 0/1 returning same/new context + * 4. GR22 = New min state save area pointer + * returns ptr to SAL rtn save loc in _tmp + */ +#define OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(_tmp) \ + movl _tmp=ia64_os_to_sal_handoff_state;; \ + DATA_VA_TO_PA(_tmp);; \ + ld8 r8=[_tmp],0x08;; \ + ld8 r9=[_tmp],0x08;; \ + ld8 r10=[_tmp],0x08;; \ + ld8 r22=[_tmp],0x08;; + // now _tmp is pointing to SAL rtn save location + +/* + * COLD_BOOT_HANDOFF_STATE() sets ia64_mca_os_to_sal_state + * imots_os_status=IA64_MCA_COLD_BOOT + * imots_sal_gp=SAL GP + * imots_context=IA64_MCA_SAME_CONTEXT + * imots_new_min_state=Min state save area pointer + * imots_sal_check_ra=Return address to location within SAL_CHECK + * + */ +#define COLD_BOOT_HANDOFF_STATE(sal_to_os_handoff,os_to_sal_handoff,tmp)\ + movl tmp=IA64_MCA_COLD_BOOT; \ + movl sal_to_os_handoff=__pa(ia64_sal_to_os_handoff_state); \ + movl os_to_sal_handoff=__pa(ia64_os_to_sal_handoff_state);; \ + st8 [os_to_sal_handoff]=tmp,8;; \ + ld8 tmp=[sal_to_os_handoff],48;; \ + st8 [os_to_sal_handoff]=tmp,8;; \ + movl tmp=IA64_MCA_SAME_CONTEXT;; \ + st8 [os_to_sal_handoff]=tmp,8;; \ + ld8 tmp=[sal_to_os_handoff],-8;; \ + st8 [os_to_sal_handoff]=tmp,8;; \ + ld8 tmp=[sal_to_os_handoff];; \ + st8 [os_to_sal_handoff]=tmp;; + +#define GET_IA64_MCA_DATA(reg) \ + GET_THIS_PADDR(reg, ia64_mca_data) \ + ;; \ + ld8 reg=[reg] + + .global ia64_os_mca_dispatch + .global ia64_os_mca_dispatch_end + .global ia64_sal_to_os_handoff_state + .global ia64_os_to_sal_handoff_state + + .text + .align 16 + +ia64_os_mca_dispatch: + + // Serialize all MCA processing + mov r3=1;; + LOAD_PHYSICAL(p0,r2,ia64_mca_serialize);; +ia64_os_mca_spin: + xchg8 r4=[r2],r3;; + cmp.ne p6,p0=r4,r0 +(p6) br ia64_os_mca_spin + + // Save the SAL to OS MCA handoff state as defined + // by SAL SPEC 3.0 + // NOTE : The order in which the state gets saved + // is dependent on the way the C-structure + // for ia64_mca_sal_to_os_state_t has been + // defined in include/asm/mca.h + SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2) + ;; + + // LOG PROCESSOR STATE INFO FROM HERE ON.. 
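+/*
+ * Added layout note (inferred from the stores in
+ * SAL_TO_OS_MCA_HANDOFF_STATE_SAVE above and from the fields mca.c reads
+ * back; the authoritative definition is ia64_mca_sal_to_os_state_t in
+ * include/asm/mca.h).  The eight st8s must track this field order:
+ *
+ *	offset  0: OS GP				(r1)
+ *	offset  8: PAL_PROC physical address		(r8)
+ *	offset 16: SAL_PROC physical address		(r9)
+ *	offset 24: SAL GP (physical)			(r10)
+ *	offset 32: rendezvous state			(r11)
+ *	offset 40: return address into SAL_CHECK	(r12, imsto_sal_check_ra)
+ *	offset 48: pal_min_state			(r17)
+ *	offset 56: proc_state_param			(r18, reloaded below via
+ *							 ia64_sal_to_os_handoff_state+56)
+ */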
+begin_os_mca_dump: + br ia64_os_mca_proc_state_dump;; + +ia64_os_mca_done_dump: + + LOAD_PHYSICAL(p0,r16,ia64_sal_to_os_handoff_state+56) + ;; + ld8 r18=[r16] // Get processor state parameter on existing PALE_CHECK. + ;; + tbit.nz p6,p7=r18,60 +(p7) br.spnt done_tlb_purge_and_reload + + // The following code purges TC and TR entries. Then reload all TC entries. + // Purge percpu data TC entries. +begin_tlb_purge_and_reload: + +#define O(member) IA64_CPUINFO_##member##_OFFSET + + GET_THIS_PADDR(r2, cpu_info) // load phys addr of cpu_info into r2 + ;; + addl r17=O(PTCE_STRIDE),r2 + addl r2=O(PTCE_BASE),r2 + ;; + ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));; // r18=ptce_base + ld4 r19=[r2],4 // r19=ptce_count[0] + ld4 r21=[r17],4 // r21=ptce_stride[0] + ;; + ld4 r20=[r2] // r20=ptce_count[1] + ld4 r22=[r17] // r22=ptce_stride[1] + mov r24=0 + ;; + adds r20=-1,r20 + ;; +#undef O + +2: + cmp.ltu p6,p7=r24,r19 +(p7) br.cond.dpnt.few 4f + mov ar.lc=r20 +3: + ptc.e r18 + ;; + add r18=r22,r18 + br.cloop.sptk.few 3b + ;; + add r18=r21,r18 + add r24=1,r24 + ;; + br.sptk.few 2b +4: + srlz.i // srlz.i implies srlz.d + ;; + + // Now purge addresses formerly mapped by TR registers + // 1. Purge ITR&DTR for kernel. + movl r16=KERNEL_START + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + ;; + ptr.i r16, r18 + ptr.d r16, r18 + ;; + srlz.i + ;; + srlz.d + ;; + // 2. Purge DTR for PERCPU data. + movl r16=PERCPU_ADDR + mov r18=PERCPU_PAGE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.d + ;; + // 3. Purge ITR for PAL code. + GET_THIS_PADDR(r2, ia64_mca_pal_base) + ;; + ld8 r16=[r2] + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.i r16,r18 + ;; + srlz.i + ;; + // 4. Purge DTR for stack. + mov r16=IA64_KR(CURRENT_STACK) + ;; + shl r16=r16,IA64_GRANULE_SHIFT + movl r19=PAGE_OFFSET + ;; + add r16=r19,r16 + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.i + ;; + // Finally reload the TR registers. + // 1. Reload DTR/ITR registers for kernel. + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + movl r17=KERNEL_START + ;; + mov cr.itir=r18 + mov cr.ifa=r17 + mov r16=IA64_TR_KERNEL + mov r19=ip + movl r18=PAGE_KERNEL + ;; + dep r17=0,r19,0, KERNEL_TR_PAGE_SHIFT + ;; + or r18=r17,r18 + ;; + itr.i itr[r16]=r18 + ;; + itr.d dtr[r16]=r18 + ;; + srlz.i + srlz.d + ;; + // 2. Reload DTR register for PERCPU data. + GET_THIS_PADDR(r2, ia64_mca_per_cpu_pte) + ;; + movl r16=PERCPU_ADDR // vaddr + movl r18=PERCPU_PAGE_SHIFT<<2 + ;; + mov cr.itir=r18 + mov cr.ifa=r16 + ;; + ld8 r18=[r2] // load per-CPU PTE + mov r16=IA64_TR_PERCPU_DATA; + ;; + itr.d dtr[r16]=r18 + ;; + srlz.d + ;; + // 3. Reload ITR for PAL code. + GET_THIS_PADDR(r2, ia64_mca_pal_pte) + ;; + ld8 r18=[r2] // load PAL PTE + ;; + GET_THIS_PADDR(r2, ia64_mca_pal_base) + ;; + ld8 r16=[r2] // load PAL vaddr + mov r19=IA64_GRANULE_SHIFT<<2 + ;; + mov cr.itir=r19 + mov cr.ifa=r16 + mov r20=IA64_TR_PALCODE + ;; + itr.i itr[r20]=r18 + ;; + srlz.i + ;; + // 4. Reload DTR for stack. 
+ mov r16=IA64_KR(CURRENT_STACK) + ;; + shl r16=r16,IA64_GRANULE_SHIFT + movl r19=PAGE_OFFSET + ;; + add r18=r19,r16 + movl r20=PAGE_KERNEL + ;; + add r16=r20,r16 + mov r19=IA64_GRANULE_SHIFT<<2 + ;; + mov cr.itir=r19 + mov cr.ifa=r18 + mov r20=IA64_TR_CURRENT_STACK + ;; + itr.d dtr[r20]=r16 + ;; + srlz.d + ;; + br.sptk.many done_tlb_purge_and_reload +err: + COLD_BOOT_HANDOFF_STATE(r20,r21,r22) + br.sptk.many ia64_os_mca_done_restore + +done_tlb_purge_and_reload: + + // Setup new stack frame for OS_MCA handling + GET_IA64_MCA_DATA(r2) + ;; + add r3 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2 + add r2 = IA64_MCA_CPU_RBSTORE_OFFSET, r2 + ;; + rse_switch_context(r6,r3,r2);; // RSC management in this new context + + GET_IA64_MCA_DATA(r2) + ;; + add r2 = IA64_MCA_CPU_STACK_OFFSET+IA64_MCA_STACK_SIZE-16, r2 + ;; + mov r12=r2 // establish new stack-pointer + + // Enter virtual mode from physical mode + VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4) +ia64_os_mca_virtual_begin: + + // Call virtual mode handler + movl r2=ia64_mca_ucmc_handler;; + mov b6=r2;; + br.call.sptk.many b0=b6;; +.ret0: + // Revert back to physical mode before going back to SAL + PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4) +ia64_os_mca_virtual_end: + + // restore the original stack frame here + GET_IA64_MCA_DATA(r2) + ;; + add r2 = IA64_MCA_CPU_STACKFRAME_OFFSET, r2 + ;; + movl r4=IA64_PSR_MC + ;; + rse_return_context(r4,r3,r2) // switch from interrupt context for RSE + + // let us restore all the registers from our PSI structure + mov r8=gp + ;; +begin_os_mca_restore: + br ia64_os_mca_proc_state_restore;; + +ia64_os_mca_done_restore: + OS_MCA_TO_SAL_HANDOFF_STATE_RESTORE(r2);; + // branch back to SALE_CHECK + ld8 r3=[r2];; + mov b0=r3;; // SAL_CHECK return address + + // release lock + movl r3=ia64_mca_serialize;; + DATA_VA_TO_PA(r3);; + st8.rel [r3]=r0 + + br b0 + ;; +ia64_os_mca_dispatch_end: +//EndMain////////////////////////////////////////////////////////////////////// + + +//++ +// Name: +// ia64_os_mca_proc_state_dump() +// +// Stub Description: +// +// This stub dumps the processor state during MCHK to a data area +// +//-- + +ia64_os_mca_proc_state_dump: +// Save bank 1 GRs 16-31 which will be used by c-language code when we switch +// to virtual addressing mode. 
+	GET_IA64_MCA_DATA(r2)
+	;;
+	add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2
+	;;
+// save ar.NaT
+	mov r5=ar.unat                  // ar.unat
+
+// save banked GRs 16-31 along with NaT bits
+	bsw.1;;
+	st8.spill [r2]=r16,8;;
+	st8.spill [r2]=r17,8;;
+	st8.spill [r2]=r18,8;;
+	st8.spill [r2]=r19,8;;
+	st8.spill [r2]=r20,8;;
+	st8.spill [r2]=r21,8;;
+	st8.spill [r2]=r22,8;;
+	st8.spill [r2]=r23,8;;
+	st8.spill [r2]=r24,8;;
+	st8.spill [r2]=r25,8;;
+	st8.spill [r2]=r26,8;;
+	st8.spill [r2]=r27,8;;
+	st8.spill [r2]=r28,8;;
+	st8.spill [r2]=r29,8;;
+	st8.spill [r2]=r30,8;;
+	st8.spill [r2]=r31,8;;
+
+	mov r4=ar.unat;;
+	st8 [r2]=r4,8                   // save user NaT bits for r16-r31
+	mov ar.unat=r5                  // restore original unat
+	bsw.0;;
+
+// save BRs
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2                   // duplicate r2 in r6
+
+	mov r3=b0
+	mov r5=b1
+	mov r7=b2;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=b3
+	mov r5=b4
+	mov r7=b5;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=b6
+	mov r5=b7;;
+	st8 [r2]=r3,2*8
+	st8 [r4]=r5,2*8;;
+
+cSaveCRs:
+// save CRs
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2                   // duplicate r2 in r6
+
+	mov r3=cr.dcr
+	mov r5=cr.itm
+	mov r7=cr.iva;;
+
+	st8 [r2]=r3,8*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;               // 48 byte increments
+
+	mov r3=cr.pta;;
+	st8 [r2]=r3,8*8;;               // 64 byte increment
+
+// if PSR.ic=1, reading interruption registers causes an illegal operation fault
+	mov r3=psr;;
+	tbit.nz.unc p6,p0=r3,PSR_IC;;   // PSI Valid Log bit pos. test
+(p6)	st8 [r2]=r0,9*8+160             // increment by 232 bytes
+begin_skip_intr_regs:
+(p6)	br SkipIntrRegs;;
+
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2                   // duplicate r2 in r6
+
+	mov r3=cr.ipsr
+	mov r5=cr.isr
+	mov r7=r0;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=cr.iip
+	mov r5=cr.ifa
+	mov r7=cr.itir;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=cr.iipa
+	mov r5=cr.ifs
+	mov r7=cr.iim;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=cr25;;                   // cr.iha
+	st8 [r2]=r3,160;;               // 160 byte increment
+
+SkipIntrRegs:
+	st8 [r2]=r0,152;;               // another 152 byte increment
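+	// Save the remaining, interrupt delivery related CRs. Registers that
+	// must not or cannot be read here (ivr, eoi, irr0-3, lrr0-1) are
+	// written out as zero placeholders so the save area keeps its fixed layout.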
+
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2                   // duplicate r2 in r6
+
+	mov r3=cr.lid
+//	mov r5=cr.ivr                   // cr.ivr, don't read it
+	mov r7=cr.tpr;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=r0                       // cr.eoi => cr67
+	mov r5=r0                       // cr.irr0 => cr68
+	mov r7=r0;;                     // cr.irr1 => cr69
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=r0                       // cr.irr2 => cr70
+	mov r5=r0                       // cr.irr3 => cr71
+	mov r7=cr.itv;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=cr.pmv
+	mov r5=cr.cmcv;;
+	st8 [r2]=r3,7*8
+	st8 [r4]=r5,7*8;;
+
+	mov r3=r0                       // cr.lrr0 => cr80
+	mov r5=r0;;                     // cr.lrr1 => cr81
+	st8 [r2]=r3,23*8
+	st8 [r4]=r5,23*8;;
+
+	adds r2=25*8,r2;;
+
+cSaveARs:
+// save ARs
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2                   // duplicate r2 in r6
+
+	mov r3=ar.k0
+	mov r5=ar.k1
+	mov r7=ar.k2;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=ar.k3
+	mov r5=ar.k4
+	mov r7=ar.k5;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=ar.k6
+	mov r5=ar.k7
+	mov r7=r0;;                     // placeholder; there is no ar.k8
+	st8 [r2]=r3,10*8
+	st8 [r4]=r5,10*8
+	st8 [r6]=r7,10*8;;              // increment by 72 bytes
+
+	mov r3=ar.rsc
+	mov ar.rsc=r0                   // put RSE in enforced lazy mode
+	mov r5=ar.bsp
+	;;
+	mov r7=ar.bspstore;;
+	st8 [r2]=r3,3*8
+	st8 [r4]=r5,3*8
+	st8 [r6]=r7,3*8;;
+
+	mov r3=ar.rnat;;
+	st8 [r2]=r3,8*13                // increment by 13x8 bytes
+
+	mov r3=ar.ccv;;
+	st8 [r2]=r3,8*4
+
+	mov r3=ar.unat;;
+	st8 [r2]=r3,8*4
+
+	mov r3=ar.fpsr;;
+	st8 [r2]=r3,8*4
+
+	mov r3=ar.itc;;
+	st8 [r2]=r3,160                 // 160 byte increment
+
+	mov r3=ar.pfs;;
+	st8 [r2]=r3,8
+
+	mov r3=ar.lc;;
+	st8 [r2]=r3,8
+
+	mov r3=ar.ec;;
+	st8 [r2]=r3
+	add r2=8*62,r2                  // padding
+
+// save RRs
+	mov ar.lc=0x08-1
+	movl r4=0x00;;
+
+cStRR:
+	dep.z r5=r4,61,3;;
+	mov r3=rr[r5];;
+	st8 [r2]=r3,8
+	add r4=1,r4
+	br.cloop.sptk.few cStRR
+	;;
+end_os_mca_dump:
+	br ia64_os_mca_done_dump;;
+
+//EndStub//////////////////////////////////////////////////////////////////////
+
+
+//++
+// Name:
+//	ia64_os_mca_proc_state_restore()
+//
+// Stub Description:
+//
+//	This is a stub to restore the saved processor state during MCHK
+//
+//--
+
+ia64_os_mca_proc_state_restore:
+
+// Restore bank1 GR16-31
+	GET_IA64_MCA_DATA(r2)
+	;;
+	add r2 = IA64_MCA_CPU_PROC_STATE_DUMP_OFFSET, r2
+
+restore_GRs:                            // restore bank-1 GRs 16-31
+	bsw.1;;
+	add r3=16*8,r2;;                // to get to NaT of GR 16-31
+	ld8 r3=[r3];;
+	mov ar.unat=r3;;                // first restore NaT
+
+	ld8.fill r16=[r2],8;;
+	ld8.fill r17=[r2],8;;
+	ld8.fill r18=[r2],8;;
+	ld8.fill r19=[r2],8;;
+	ld8.fill r20=[r2],8;;
+	ld8.fill r21=[r2],8;;
+	ld8.fill r22=[r2],8;;
+	ld8.fill r23=[r2],8;;
+	ld8.fill r24=[r2],8;;
+	ld8.fill r25=[r2],8;;
+	ld8.fill r26=[r2],8;;
+	ld8.fill r27=[r2],8;;
+	ld8.fill r28=[r2],8;;
+	ld8.fill r29=[r2],8;;
+	ld8.fill r30=[r2],8;;
+	ld8.fill r31=[r2],8;;
+
+	ld8 r3=[r2],8;;                 // increment to skip NaT
+	bsw.0;;
+
+restore_BRs:
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2;;                 // duplicate r2 in r6
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+	mov b0=r3
+	mov b1=r5
+	mov b2=r7;;
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+	mov b3=r3
+	mov b4=r5
+	mov b5=r7;;
+
+	ld8 r3=[r2],2*8
+	ld8 r5=[r4],2*8;;
+	mov b6=r3
+	mov b7=r5;;
+
+restore_CRs:
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2;;                 // duplicate r2 in r6
+
+	ld8 r3=[r2],8*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;               // 48 byte increments
+	mov cr.dcr=r3
+	mov cr.itm=r5
+	mov cr.iva=r7;;
+
+	ld8 r3=[r2],8*8;;               // 64 byte increments
+//	mov cr.pta=r3
+
+
+// if PSR.ic=1, reading interruption registers causes an illegal operation fault
+	mov r3=psr;;
+	tbit.nz.unc p6,p0=r3,PSR_IC;;   // PSI Valid Log bit pos. test
+(p6)	st8 [r2]=r0,9*8+160             // increment by 232 bytes
+
+begin_rskip_intr_regs:
+(p6)	br rSkipIntrRegs;;
+
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2;;                 // duplicate r2 in r6
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+	mov cr.ipsr=r3
+//	mov cr.isr=r5                   // cr.isr is read only
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+	mov cr.iip=r3
+	mov cr.ifa=r5
+	mov cr.itir=r7;;
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+	mov cr.iipa=r3
+	mov cr.ifs=r5
+	mov cr.iim=r7
+
+	ld8 r3=[r2],160;;               // 160 byte increment
+	mov cr.iha=r3
+
+rSkipIntrRegs:
+	ld8 r3=[r2],152;;               // another 152 byte increment
+
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2;;                 // duplicate r2 in r6
+
+	ld8 r3=[r2],8*3
+	ld8 r5=[r4],8*3
+	ld8 r7=[r6],8*3;;
+	mov cr.lid=r3
+//	mov cr.ivr=r5                   // cr.ivr is read only
+	mov cr.tpr=r7;;
+
+	ld8 r3=[r2],8*3
+	ld8 r5=[r4],8*3
+	ld8 r7=[r6],8*3;;
+//	mov cr.eoi=r3
+//	mov cr.irr0=r5                  // cr.irr0 is read only
+//	mov cr.irr1=r7;;                // cr.irr1 is read only
+
+	ld8 r3=[r2],8*3
+	ld8 r5=[r4],8*3
+	ld8 r7=[r6],8*3;;
+//	mov cr.irr2=r3                  // cr.irr2 is read only
+//	mov cr.irr3=r5                  // cr.irr3 is read only
+	mov cr.itv=r7;;
+
+	ld8 r3=[r2],8*7
+	ld8 r5=[r4],8*7;;
+	mov cr.pmv=r3
+	mov cr.cmcv=r5;;
+
+	ld8 r3=[r2],8*23
+	ld8 r5=[r4],8*23;;
+	adds r2=8*23,r2
+	adds r4=8*23,r4;;
+//	mov cr.lrr0=r3
+//	mov cr.lrr1=r5
+
+	adds r2=8*2,r2;;
+
+restore_ARs:
+	add r4=8,r2                     // duplicate r2 in r4
+	add r6=2*8,r2;;                 // duplicate r2 in r6
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+	mov ar.k0=r3
+	mov ar.k1=r5
+	mov ar.k2=r7;;
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+	mov ar.k3=r3
+	mov ar.k4=r5
+	mov ar.k5=r7;;
+
+	ld8 r3=[r2],10*8
+	ld8 r5=[r4],10*8
+	ld8 r7=[r6],10*8;;
+	mov ar.k6=r3
+	mov ar.k7=r5
+	;;
+
+	ld8 r3=[r2],3*8
+	ld8 r5=[r4],3*8
+	ld8 r7=[r6],3*8;;
+//	mov ar.rsc=r3
+//	mov ar.bsp=r5                   // ar.bsp is read only
+	mov ar.rsc=r0                   // make sure that RSE is in enforced lazy mode
+	;;
+	mov ar.bspstore=r7;;
+
+	ld8 r9=[r2],8*13;;
+	mov ar.rnat=r9
+
+	mov ar.rsc=r3
+	ld8 r3=[r2],8*4;;
+	mov ar.ccv=r3
+
+	ld8 r3=[r2],8*4;;
+	mov ar.unat=r3
+
+	ld8 r3=[r2],8*4;;
+	mov ar.fpsr=r3
+
+	ld8 r3=[r2],160;;               // 160 byte increment
+//	mov ar.itc=r3
+
+	ld8 r3=[r2],8;;
+	mov ar.pfs=r3
+
+	ld8 r3=[r2],8;;
+	mov ar.lc=r3
+
+	ld8 r3=[r2];;
+	mov ar.ec=r3
+	add r2=8*62,r2;;                // padding
+
+restore_RRs:
+	mov r5=ar.lc
+	mov ar.lc=0x08-1
+	movl r4=0x00;;
+cStRRr:
+	dep.z r7=r4,61,3
+	ld8 r3=[r2],8;;
+	mov rr[r7]=r3                   // what are its access privileges?
+	add r4=1,r4
+	br.cloop.sptk.few cStRRr
+	;;
+	mov ar.lc=r5
+	;;
+end_os_mca_restore:
+	br ia64_os_mca_done_restore;;
+
+//EndStub//////////////////////////////////////////////////////////////////////
+
+
+// ok, the issue here is that we need to save state information so
+// it can be usable by the kernel debugger and show_regs routines.
+// In order to do this, our best bet is to save the current state (plus
+// the state information obtained from the MIN_STATE_AREA) into a pt_regs
+// format. This way we can pass it on in a usable format.
+//
+
+//
+// SAL to OS entry point for INIT on the monarch processor
+// This has been defined for registration purposes with SAL
+// as a part of ia64_mca_init.
+//
+// When we get here, the following registers have been
+// set by the SAL for our use
+//
+// 1. GR1 = OS INIT GP
+// 2. GR8 = PAL_PROC physical address
+// 3. GR9 = SAL_PROC physical address
+// 4. GR10 = SAL GP (physical)
+// 5. GR11 = Init Reason
+//	0 = Received INIT for event other than crash dump switch
+//	1 = Received wakeup at the end of an OS_MCA corrected machine check
+//	2 = Received INIT due to CrashDump switch assertion
+//
+// 6. GR12 = Return address to location within SAL_INIT procedure
+
+
+GLOBAL_ENTRY(ia64_monarch_init_handler)
+	.prologue
+	// stash the information the SAL passed to the OS
+	SAL_TO_OS_MCA_HANDOFF_STATE_SAVE(r2)
+	;;
+	SAVE_MIN_WITH_COVER
+	;;
+	mov r8=cr.ifa
+	mov r9=cr.isr
+	adds r3=8,r2                    // set up second base pointer
+	;;
+	SAVE_REST
+
+// ok, enough should be saved at this point to be dangerous, and supply
+// information for a dump.
+// We need to switch to virtual mode before hitting the C functions.
+
+	movl r2=IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN
+	mov r3=psr                      // get the current psr, minimum enabled at this point
+	;;
+	or r2=r2,r3
+	;;
+	movl r3=IVirtual_Switch
+	;;
+	mov cr.iip=r3                   // short return to set the appropriate bits
+	mov cr.ipsr=r2                  // need to do an rfi to set appropriate bits
+	;;
+	rfi
+	;;
+IVirtual_Switch:
+	//
+	// We should now be running in virtual mode
+	//
+	// Let's call the C handler to get the rest of the state info
+	//
+	alloc r14=ar.pfs,0,0,2,0        // now it's safe (must be first in insn group!)
+	;;
+	adds out0=16,sp                 // out0 = pointer to pt_regs
+	;;
+	DO_SAVE_SWITCH_STACK
+	.body
+	adds out1=16,sp                 // out1 = pointer to switch_stack
+
+	br.call.sptk.many rp=ia64_init_handler
+.ret1:
+
+return_from_init:
+	br.sptk return_from_init
+END(ia64_monarch_init_handler)
+
+//
+// SAL to OS entry point for INIT on the slave processor
+// This has been defined for registration purposes with SAL
+// as a part of ia64_mca_init.
+//
+
+GLOBAL_ENTRY(ia64_slave_init_handler)
+1:	br.sptk 1b
+END(ia64_slave_init_handler)
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
new file mode 100644
index 000000000000..ab478172c349
--- /dev/null
+++ b/arch/ia64/kernel/mca_drv.c
@@ -0,0 +1,639 @@
+/*
+ * File: mca_drv.c
+ * Purpose: Generic MCA handling layer
+ *
+ * Copyright (C) 2004 FUJITSU LIMITED
+ * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com)
+ */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kallsyms.h>
+#include <linux/smp_lock.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+
+#include <asm/delay.h>
+#include <asm/machvec.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/system.h>
+#include <asm/sal.h>
+#include <asm/mca.h>
+
+#include <asm/irq.h>
+#include <asm/hw_irq.h>
+
+#include "mca_drv.h"
+
+/* max size of SAL error record (default) */
+static int sal_rec_max = 10000;
+
+/* from mca.c */
+static ia64_mca_sal_to_os_state_t *sal_to_os_handoff_state;
+static ia64_mca_os_to_sal_state_t *os_to_sal_handoff_state;
+
+/* from mca_drv_asm.S */
+extern void *mca_handler_bhhook(void);
+
+static DEFINE_SPINLOCK(mca_bh_lock);
+
+typedef enum {
+	MCA_IS_LOCAL = 0,
+	MCA_IS_GLOBAL = 1
+} mca_type_t;
+
+#define MAX_PAGE_ISOLATE 1024
+
+static struct page *page_isolate[MAX_PAGE_ISOLATE];
+static int num_page_isolate = 0;
+
+typedef enum {
+	ISOLATE_NG = 0,
+	ISOLATE_OK = 1
+} isolate_status_t;
+
+/*
+ * This pool keeps pointers to the section parts of a SAL error record
+ */
+static struct
{
+	slidx_list_t *buffer; /* section pointer list pool */
+	int cur_idx; /* Current index of section pointer list pool */
+	int max_idx; /* Maximum index of section pointer list pool */
+} slidx_pool;
+
+/**
+ * mca_page_isolate - isolate a poisoned page so that it is not used again
+ * @paddr: poisoned memory location
+ *
+ * Return value:
+ *	ISOLATE_OK / ISOLATE_NG
+ */
+
+static isolate_status_t
+mca_page_isolate(unsigned long paddr)
+{
+	int i;
+	struct page *p;
+
+	/* whether physical address is valid or not */
+	if ( !ia64_phys_addr_valid(paddr) )
+		return ISOLATE_NG;
+
+	/* convert physical address to physical page number */
+	p = pfn_to_page(paddr>>PAGE_SHIFT);
+
+	/* check whether the page has already been registered */
+	for( i = 0; i < num_page_isolate; i++ )
+		if( page_isolate[i] == p )
+			return ISOLATE_OK; /* already listed */
+
+	/* limitation check */
+	if( num_page_isolate == MAX_PAGE_ISOLATE )
+		return ISOLATE_NG;
+
+	/* reject pages with the 'Slab' or 'Reserved' attribute */
+	if( PageSlab(p) || PageReserved(p) )
+		return ISOLATE_NG;
+
+	/* add the 'Reserved' attribute and register the page */
+	SetPageReserved(p);
+	page_isolate[num_page_isolate++] = p;
+
+	return ISOLATE_OK;
+}
+
+/**
+ * mca_handler_bh - Kill the process that hit a memory read error
+ * @paddr: poisoned address received from the MCA Handler
+ */
+
+void
+mca_handler_bh(unsigned long paddr)
+{
+	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
+		current->pid, current->comm);
+
+	spin_lock(&mca_bh_lock);
+	if (mca_page_isolate(paddr) == ISOLATE_OK) {
+		printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr);
+	} else {
+		printk(KERN_DEBUG "Page isolation: ( %lx ) failure.\n", paddr);
+	}
+	spin_unlock(&mca_bh_lock);
+
+	/* the current process now kills itself */
+	force_sig(SIGKILL, current);
+	schedule();
+}
+
+/**
+ * mca_make_peidx - Make an index of the processor error section
+ * @slpi: pointer to record of processor error section
+ * @peidx: pointer to index of processor error section
+ */
+
+static void
+mca_make_peidx(sal_log_processor_info_t *slpi, peidx_table_t *peidx)
+{
+	/*
+	 * calculate the start address of
+	 * "struct cpuid_info" and "sal_processor_static_info_t".
+	 */
+	u64 total_check_num = slpi->valid.num_cache_check
+				+ slpi->valid.num_tlb_check
+				+ slpi->valid.num_bus_check
+				+ slpi->valid.num_reg_file_check
+				+ slpi->valid.num_ms_check;
+	u64 head_size = sizeof(sal_log_mod_error_info_t) * total_check_num
+			+ sizeof(sal_log_processor_info_t);
+	u64 mid_size = slpi->valid.cpuid_info * sizeof(struct sal_cpuid_info);
+
+	peidx_head(peidx) = slpi;
+	peidx_mid(peidx) = (struct sal_cpuid_info *)
+		(slpi->valid.cpuid_info ? ((char*)slpi + head_size) : NULL);
+	peidx_bottom(peidx) = (sal_processor_static_info_t *)
+		(slpi->valid.psi_static_struct ?
+ ((char*)slpi + head_size + mid_size) : NULL); +} + +/** + * mca_make_slidx - Make index of SAL error record + * @buffer: pointer to SAL error record + * @slidx: pointer to index of SAL error record + * + * Return value: + * 1 if record has platform error / 0 if not + */ +#define LOG_INDEX_ADD_SECT_PTR(sect, ptr) \ + { slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; \ + hl->hdr = ptr; \ + list_add(&hl->list, &(sect)); \ + slidx_pool.cur_idx = (slidx_pool.cur_idx + 1)%slidx_pool.max_idx; } + +static int +mca_make_slidx(void *buffer, slidx_table_t *slidx) +{ + int platform_err = 0; + int record_len = ((sal_log_record_header_t*)buffer)->len; + u32 ercd_pos; + int sects; + sal_log_section_hdr_t *sp; + + /* + * Initialize index referring current record + */ + INIT_LIST_HEAD(&(slidx->proc_err)); + INIT_LIST_HEAD(&(slidx->mem_dev_err)); + INIT_LIST_HEAD(&(slidx->sel_dev_err)); + INIT_LIST_HEAD(&(slidx->pci_bus_err)); + INIT_LIST_HEAD(&(slidx->smbios_dev_err)); + INIT_LIST_HEAD(&(slidx->pci_comp_err)); + INIT_LIST_HEAD(&(slidx->plat_specific_err)); + INIT_LIST_HEAD(&(slidx->host_ctlr_err)); + INIT_LIST_HEAD(&(slidx->plat_bus_err)); + INIT_LIST_HEAD(&(slidx->unsupported)); + + /* + * Extract a Record Header + */ + slidx->header = buffer; + + /* + * Extract each section records + * (arranged from "int ia64_log_platform_info_print()") + */ + for (ercd_pos = sizeof(sal_log_record_header_t), sects = 0; + ercd_pos < record_len; ercd_pos += sp->len, sects++) { + sp = (sal_log_section_hdr_t *)((char*)buffer + ercd_pos); + if (!efi_guidcmp(sp->guid, SAL_PROC_DEV_ERR_SECT_GUID)) { + LOG_INDEX_ADD_SECT_PTR(slidx->proc_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->mem_dev_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SEL_DEV_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->sel_dev_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_PCI_BUS_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->pci_bus_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->smbios_dev_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_PCI_COMP_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->pci_comp_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_SPECIFIC_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->plat_specific_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_HOST_CTLR_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->host_ctlr_err, sp); + } else if (!efi_guidcmp(sp->guid, SAL_PLAT_BUS_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->plat_bus_err, sp); + } else { + LOG_INDEX_ADD_SECT_PTR(slidx->unsupported, sp); + } + } + slidx->n_sections = sects; + + return platform_err; +} + +/** + * init_record_index_pools - Initialize pool of lists for SAL record index + * + * Return value: + * 0 on Success / -ENOMEM on Failure + */ +static int +init_record_index_pools(void) +{ + int i; + int rec_max_size; /* Maximum size of SAL error records */ + int sect_min_size; /* Minimum size of SAL error sections */ + /* minimum size table of each section */ + static int sal_log_sect_min_sizes[] = { + sizeof(sal_log_processor_info_t) + sizeof(sal_processor_static_info_t), + sizeof(sal_log_mem_dev_err_info_t), + sizeof(sal_log_sel_dev_err_info_t), + sizeof(sal_log_pci_bus_err_info_t), + 
sizeof(sal_log_smbios_dev_err_info_t),
+		sizeof(sal_log_pci_comp_err_info_t),
+		sizeof(sal_log_plat_specific_err_info_t),
+		sizeof(sal_log_host_ctlr_err_info_t),
+		sizeof(sal_log_plat_bus_err_info_t),
+	};
+
+	/*
+	 * The MCA handler cannot allocate new memory on the fly,
+	 * so we preallocate enough memory to handle a SAL record.
+	 *
+	 * Initialize a handling set of slidx_pool:
+	 * 1. Pick up the max size of SAL error records
+	 * 2. Pick up the min size of SAL error sections
+	 * 3. Allocate the pool large enough to hold 2 SAL records
+	 * (now we can estimate the maximum number of sections in a record.)
+	 */
+
+	/* - 1 - */
+	rec_max_size = sal_rec_max;
+
+	/* - 2 - */
+	sect_min_size = sal_log_sect_min_sizes[0];
+	for (i = 1; i < ARRAY_SIZE(sal_log_sect_min_sizes); i++)
+		if (sect_min_size > sal_log_sect_min_sizes[i])
+			sect_min_size = sal_log_sect_min_sizes[i];
+
+	/* - 3 - */
+	slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1;
+	slidx_pool.buffer = (slidx_list_t *) kmalloc(slidx_pool.max_idx * sizeof(slidx_list_t), GFP_KERNEL);
+
+	return slidx_pool.buffer ? 0 : -ENOMEM;
+}
+
+
+/*****************************************************************************
+ * Recovery functions                                                        *
+ *****************************************************************************/
+
+/**
+ * is_mca_global - Check whether this MCA is global or not
+ * @peidx: pointer to index of processor error section
+ * @pbci: pointer to pal_bus_check_info_t
+ *
+ * Return value:
+ *	MCA_IS_LOCAL / MCA_IS_GLOBAL
+ */
+
+static mca_type_t
+is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci)
+{
+	pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx);
+
+	/*
+	 * PAL can request a rendezvous if the MCA has a global scope.
+	 * If the "rz_always" flag is set, SAL requests an MCA rendezvous
+	 * even if the MCA is not global.
+	 * Therefore it is a local MCA when a rendezvous has not been requested.
+	 * If the rendezvous failed, the system must go down.
+	 */
+	switch (sal_to_os_handoff_state->imsto_rendez_state) {
+	case -1: /* SAL rendezvous unsuccessful */
+		return MCA_IS_GLOBAL;
+	case 0: /* SAL rendezvous not required */
+		return MCA_IS_LOCAL;
+	case 1: /* SAL rendezvous successful int */
+	case 2: /* SAL rendezvous successful int with init */
+	default:
+		break;
+	}
+
+	/*
+	 * If one or more Cache/TLB/Reg_File/Uarch_Check entries are here,
+	 * it would be a local MCA. (i.e. processor internal error)
+	 */
+	if (psp->tc || psp->cc || psp->rc || psp->uc)
+		return MCA_IS_LOCAL;
+
+	/*
+	 * A Bus_Check structure with the Bus_Check.ib (internal bus error) flag
+	 * set would be a global MCA. (e.g. a system bus address parity error)
+	 */
+	if (!pbci || pbci->ib)
+		return MCA_IS_GLOBAL;
+
+	/*
+	 * A Bus_Check structure with the Bus_Check.eb (external bus error) flag
+	 * set could be either a local MCA or a global MCA.
+	 *
+	 * Referring to Bus_Check.bsi:
+	 * 0: Unknown/unclassified
+	 * 1: BERR#
+	 * 2: BINIT#
+	 * 3: Hard Fail
+	 * (FIXME: Are these SGI specific or generic bsi values?)
+	 */
+	if (pbci->eb)
+		switch (pbci->bsi) {
+		case 0:
+			/* e.g. a load from poisoned memory */
+			return MCA_IS_LOCAL;
+		case 1:
+		case 2:
+		case 3:
+			return MCA_IS_GLOBAL;
+		}
+
+	return MCA_IS_GLOBAL;
+}
+
+/**
+ * recover_from_read_error - Try to recover errors whose type is "read".
+ * @slidx: pointer to index of SAL error record
+ * @peidx: pointer to index of processor error section
+ * @pbci: pointer to pal_bus_check_info
+ *
+ * Return value:
+ *	1 on Success / 0 on Failure
+ */
+
+static int
+recover_from_read_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci)
+{
+	sal_log_mod_error_info_t *smei;
+	pal_min_state_area_t *pmsa;
+	struct ia64_psr *psr1, *psr2;
+	ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook;
+
+	/* Is the target address valid? */
+	if (!pbci->tv)
+		return 0;
+
+	/*
+	 * cpu read or memory-mapped io read
+	 *
+	 *	offending process  affected process  OS MCA do
+	 *	kernel mode        kernel mode       down system
+	 *	kernel mode        user mode         kill the process
+	 *	user mode          kernel mode       down system (*)
+	 *	user mode          user mode         kill the process
+	 *
+	 * (*) You could terminate the offending user-mode process
+	 *	if (pbci->pv && pbci->pl != 0) *and* if you are sure
+	 *	the process does not hold any kernel locks.
+	 */
+
+	psr1 = (struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
+
+	/*
+	 * Check the privilege level of the interrupted context.
+	 * If it is user mode, then terminate the affected process.
+	 */
+	if (psr1->cpl != 0) {
+		smei = peidx_bus_check(peidx, 0);
+		if (smei->valid.target_identifier) {
+			/*
+			 * setup for resume to the bottom half of MCA,
+			 * "mca_handler_bhhook"
+			 */
+			pmsa = (pal_min_state_area_t *)(sal_to_os_handoff_state->pal_min_state | (6ul<<61));
+			/* pass to bhhook as 1st argument (gr8) */
+			pmsa->pmsa_gr[8-1] = smei->target_identifier;
+			/* set interrupted return address (not actually used) */
+			pmsa->pmsa_br0 = pmsa->pmsa_iip;
+			/* change resume address to bottom half */
+			pmsa->pmsa_iip = mca_hdlr_bh->fp;
+			pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp;
+			/* set cpl to kernel mode */
+			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
+			psr2->cpl = 0;
+			psr2->ri = 0;
+
+			return 1;
+		}
+
+	}
+
+	return 0;
+}
+
+/**
+ * recover_from_platform_error - Recover from a platform error.
+ * @slidx: pointer to index of SAL error record
+ * @peidx: pointer to index of processor error section
+ * @pbci: pointer to pal_bus_check_info
+ *
+ * Return value:
+ *	1 on Success / 0 on Failure
+ */
+
+static int
+recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci)
+{
+	int status = 0;
+	pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx);
+
+	if (psp->bc && pbci->eb && pbci->bsi == 0) {
+		switch(pbci->type) {
+		case 1: /* partial read */
+		case 3: /* full line(cpu) read */
+		case 9: /* I/O space read */
+			status = recover_from_read_error(slidx, peidx, pbci);
+			break;
+		case 0: /* unknown */
+		case 2: /* partial write */
+		case 4: /* full line write */
+		case 5: /* implicit or explicit write-back operation */
+		case 6: /* snoop probe */
+		case 7: /* incoming or outgoing ptc.g */
+		case 8: /* write coalescing transactions */
+		case 10: /* I/O space write */
+		case 11: /* inter-processor interrupt message(IPI) */
+		case 12: /* interrupt acknowledge or external task priority cycle */
+		default:
+			break;
+		}
+	}
+
+	return status;
+}
+
+/**
+ * recover_from_processor_error
+ * @platform: whether any platform error sections are present or not
+ * @slidx: pointer to index of SAL error record
+ * @peidx: pointer to index of processor error section
+ * @pbci: pointer to pal_bus_check_info
+ *
+ * Return value:
+ *	1 on Success / 0 on Failure
+ */
+/*
+ * Later we try to recover when all of the following conditions are satisfied:
+ * 1. Exactly one processor error section exists.
+ * 2. A BUS_CHECK exists and no other checks do (except TLB_CHECK).
+ * 3. There is exactly one BUS_CHECK_INFO entry.
+ * 4. The "external bus error" flag is set and no other flags are.
+ */
+
+static int
+recover_from_processor_error(int platform, slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci)
+{
+	pal_processor_state_info_t *psp = (pal_processor_state_info_t*)peidx_psp(peidx);
+
+	/*
+	 * We cannot recover from errors other than bus_check ones.
+	 */
+	if (psp->cc || psp->rc || psp->uc)
+		return 0;
+
+	/*
+	 * If there is no bus error, the record is weird, but we need not recover.
+	 */
+	if (psp->bc == 0 || pbci == NULL)
+		return 1;
+
+	/*
+	 * Sorry, we cannot handle more than one bus error.
+	 */
+	if (peidx_bus_check_num(peidx) > 1)
+		return 0;
+	/*
+	 * At this point there is exactly one bus error.
+	 */
+	if (pbci->ib || pbci->cc)
+		return 0;
+	if (pbci->eb && pbci->bsi > 0)
+		return 0;
+	if (psp->ci == 0)
+		return 0;
+
+	/*
+	 * This is a local MCA, judged to be a recoverable external bus error.
+	 * (e.g. a load from poisoned memory)
+	 * This means "there are some platform errors".
+	 */
+	if (platform)
+		return recover_from_platform_error(slidx, peidx, pbci);
+	/*
+	 * The SAL error record looks strange, so we cannot recover.
+	 */
+	return 0;
+}
+
+/**
+ * mca_try_to_recover - Try to recover from an MCA
+ * @rec: pointer to a SAL error record
+ *
+ * Return value:
+ *	1 on Success / 0 on Failure
+ */
+
+static int
+mca_try_to_recover(void *rec,
+	ia64_mca_sal_to_os_state_t *sal_to_os_state,
+	ia64_mca_os_to_sal_state_t *os_to_sal_state)
+{
+	int platform_err;
+	int n_proc_err;
+	slidx_table_t slidx;
+	peidx_table_t peidx;
+	pal_bus_check_info_t pbci;
+
+	/* handoff state from/to mca.c */
+	sal_to_os_handoff_state = sal_to_os_state;
+	os_to_sal_handoff_state = os_to_sal_state;
+
+	/* Make an index of the SAL error record */
+	platform_err = mca_make_slidx(rec, &slidx);
+
+	/* Count processor error sections */
+	n_proc_err = slidx_count(&slidx, proc_err);
+
+	/* For now, the OS can only recover when there is one processor error section */
+	if (n_proc_err > 1)
+		return 0;
+	else if (n_proc_err == 0) {
+		/* Weird SAL record ...
We need not to recover */ + + return 1; + } + + /* Make index of processor error section */ + mca_make_peidx((sal_log_processor_info_t*)slidx_first_entry(&slidx.proc_err)->hdr, &peidx); + + /* Extract Processor BUS_CHECK[0] */ + *((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0); + + /* Check whether MCA is global or not */ + if (is_mca_global(&peidx, &pbci)) + return 0; + + /* Try to recover a processor error */ + return recover_from_processor_error(platform_err, &slidx, &peidx, &pbci); +} + +/* + * ============================================================================= + */ + +int __init mca_external_handler_init(void) +{ + if (init_record_index_pools()) + return -ENOMEM; + + /* register external mca handlers */ + if (ia64_reg_MCA_extension(mca_try_to_recover)){ + printk(KERN_ERR "ia64_reg_MCA_extension failed.\n"); + kfree(slidx_pool.buffer); + return -EFAULT; + } + return 0; +} + +void __exit mca_external_handler_exit(void) +{ + /* unregister external mca handlers */ + ia64_unreg_MCA_extension(); + kfree(slidx_pool.buffer); +} + +module_init(mca_external_handler_init); +module_exit(mca_external_handler_exit); + +module_param(sal_rec_max, int, 0644); +MODULE_PARM_DESC(sal_rec_max, "Max size of SAL error record"); + +MODULE_DESCRIPTION("ia64 platform dependent mca handler driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h new file mode 100644 index 000000000000..0227b761f2c4 --- /dev/null +++ b/arch/ia64/kernel/mca_drv.h @@ -0,0 +1,113 @@ +/* + * File: mca_drv.h + * Purpose: Define helpers for Generic MCA handling + * + * Copyright (C) 2004 FUJITSU LIMITED + * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) + */ +/* + * Processor error section: + * + * +-sal_log_processor_info_t *info-------------+ + * | sal_log_section_hdr_t header; | + * | ... | + * | sal_log_mod_error_info_t info[0]; | + * +-+----------------+-------------------------+ + * | CACHE_CHECK | ^ num_cache_check v + * +----------------+ + * | TLB_CHECK | ^ num_tlb_check v + * +----------------+ + * | BUS_CHECK | ^ num_bus_check v + * +----------------+ + * | REG_FILE_CHECK | ^ num_reg_file_check v + * +----------------+ + * | MS_CHECK | ^ num_ms_check v + * +-struct cpuid_info *id----------------------+ + * | regs[5]; | + * | reserved; | + * +-sal_processor_static_info_t *regs----------+ + * | valid; | + * | ... 
| + * | fr[128]; | + * +--------------------------------------------+ + */ + +/* peidx: index of processor error section */ +typedef struct peidx_table { + sal_log_processor_info_t *info; + struct sal_cpuid_info *id; + sal_processor_static_info_t *regs; +} peidx_table_t; + +#define peidx_head(p) (((p)->info)) +#define peidx_mid(p) (((p)->id)) +#define peidx_bottom(p) (((p)->regs)) + +#define peidx_psp(p) (&(peidx_head(p)->proc_state_parameter)) +#define peidx_field_valid(p) (&(peidx_head(p)->valid)) +#define peidx_minstate_area(p) (&(peidx_bottom(p)->min_state_area)) + +#define peidx_cache_check_num(p) (peidx_head(p)->valid.num_cache_check) +#define peidx_tlb_check_num(p) (peidx_head(p)->valid.num_tlb_check) +#define peidx_bus_check_num(p) (peidx_head(p)->valid.num_bus_check) +#define peidx_reg_file_check_num(p) (peidx_head(p)->valid.num_reg_file_check) +#define peidx_ms_check_num(p) (peidx_head(p)->valid.num_ms_check) + +#define peidx_cache_check_idx(p, n) (n) +#define peidx_tlb_check_idx(p, n) (peidx_cache_check_idx(p, peidx_cache_check_num(p)) + n) +#define peidx_bus_check_idx(p, n) (peidx_tlb_check_idx(p, peidx_tlb_check_num(p)) + n) +#define peidx_reg_file_check_idx(p, n) (peidx_bus_check_idx(p, peidx_bus_check_num(p)) + n) +#define peidx_ms_check_idx(p, n) (peidx_reg_file_check_idx(p, peidx_reg_file_check_num(p)) + n) + +#define peidx_mod_error_info(p, name, n) \ +({ int __idx = peidx_##name##_idx(p, n); \ + sal_log_mod_error_info_t *__ret = NULL; \ + if (peidx_##name##_num(p) > n) /*BUG*/ \ + __ret = &(peidx_head(p)->info[__idx]); \ + __ret; }) + +#define peidx_cache_check(p, n) peidx_mod_error_info(p, cache_check, n) +#define peidx_tlb_check(p, n) peidx_mod_error_info(p, tlb_check, n) +#define peidx_bus_check(p, n) peidx_mod_error_info(p, bus_check, n) +#define peidx_reg_file_check(p, n) peidx_mod_error_info(p, reg_file_check, n) +#define peidx_ms_check(p, n) peidx_mod_error_info(p, ms_check, n) + +#define peidx_check_info(proc, name, n) \ +({ \ + sal_log_mod_error_info_t *__info = peidx_mod_error_info(proc, name, n);\ + u64 __temp = __info && __info->valid.check_info \ + ? __info->check_info : 0; \ + __temp; }) + +/* slidx: index of SAL log error record */ + +typedef struct slidx_list { + struct list_head list; + sal_log_section_hdr_t *hdr; +} slidx_list_t; + +typedef struct slidx_table { + sal_log_record_header_t *header; + int n_sections; /* # of section headers */ + struct list_head proc_err; + struct list_head mem_dev_err; + struct list_head sel_dev_err; + struct list_head pci_bus_err; + struct list_head smbios_dev_err; + struct list_head pci_comp_err; + struct list_head plat_specific_err; + struct list_head host_ctlr_err; + struct list_head plat_bus_err; + struct list_head unsupported; /* list of unsupported sections */ +} slidx_table_t; + +#define slidx_foreach_entry(pos, head) \ + list_for_each_entry(pos, head, list) +#define slidx_first_entry(head) \ + (((head)->next != (head)) ? 
list_entry((head)->next, typeof(slidx_list_t), list) : NULL) +#define slidx_count(slidx, sec) \ +({ int __count = 0; \ + slidx_list_t *__pos; \ + slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\ + __count; }) + diff --git a/arch/ia64/kernel/mca_drv_asm.S b/arch/ia64/kernel/mca_drv_asm.S new file mode 100644 index 000000000000..bcfa05acc561 --- /dev/null +++ b/arch/ia64/kernel/mca_drv_asm.S @@ -0,0 +1,45 @@ +/* + * File: mca_drv_asm.S + * Purpose: Assembly portion of Generic MCA handling + * + * Copyright (C) 2004 FUJITSU LIMITED + * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) + */ +#include <linux/config.h> +#include <linux/threads.h> + +#include <asm/asmmacro.h> +#include <asm/processor.h> + +GLOBAL_ENTRY(mca_handler_bhhook) + invala // clear RSE ? + ;; // + cover // + ;; // + clrrrb // + ;; + alloc r16=ar.pfs,0,2,1,0 // make a new frame + ;; + mov r13=IA64_KR(CURRENT) // current task pointer + ;; + adds r12=IA64_TASK_THREAD_KSP_OFFSET,r13 + ;; + ld8 r12=[r12] // stack pointer + ;; + mov loc0=r16 + movl loc1=mca_handler_bh // recovery C function + ;; + mov out0=r8 // poisoned address + mov b6=loc1 + ;; + mov loc1=rp + ;; + br.call.sptk.many rp=b6 // not return ... + ;; + mov ar.pfs=loc0 + mov rp=loc1 + ;; + mov r8=r0 + br.ret.sptk.many rp + ;; +END(mca_handler_bhhook) diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h new file mode 100644 index 000000000000..1dbc7b2497c9 --- /dev/null +++ b/arch/ia64/kernel/minstate.h @@ -0,0 +1,251 @@ +#include <linux/config.h> + +#include <asm/cache.h> + +#include "entry.h" + +/* + * For ivt.s we want to access the stack virtually so we don't have to disable translation + * on interrupts. + * + * On entry: + * r1: pointer to current task (ar.k6) + */ +#define MINSTATE_START_SAVE_MIN_VIRT \ +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ + ;; \ +(pUStk) mov.m r24=ar.rnat; \ +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ +(pKStk) mov r1=sp; /* get sp */ \ + ;; \ +(pUStk) lfetch.fault.excl.nt1 [r22]; \ +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ + ;; \ +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ + ;; \ +(pUStk) mov r18=ar.bsp; \ +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ + +#define MINSTATE_END_SAVE_MIN_VIRT \ + bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ + ;; + +/* + * For mca_asm.S we want to access the stack physically since the state is saved before we + * go virtual and don't want to destroy the iip or ipsr. 
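+ * The MINSTATE_PHYS variants below therefore locate the per-CPU MCA save
+ * area through IA64_KR(PER_CPU_DATA) and build sp from it, rather than
+ * trusting sp or any TLB mapping.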
+ */ +#define MINSTATE_START_SAVE_MIN_PHYS \ +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \ +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \ +(pKStk) ld8 r3 = [r3];; \ +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \ +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \ +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ + ;; \ +(pUStk) mov r24=ar.rnat; \ +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ +(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ + ;; \ +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ + ;; \ +(pUStk) mov r18=ar.bsp; \ +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ + +#define MINSTATE_END_SAVE_MIN_PHYS \ + dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ + ;; + +#ifdef MINSTATE_VIRT +# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT) +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT +#endif + +#ifdef MINSTATE_PHYS +# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS +#endif + +/* + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves + * the minimum state necessary that allows us to turn psr.ic back + * on. + * + * Assumed state upon entry: + * psr.ic: off + * r31: contains saved predicates (pr) + * + * Upon exit, the state is as follows: + * psr.ic: off + * r2 = points to &pt_regs.r16 + * r8 = contents of ar.ccv + * r9 = contents of ar.csd + * r10 = contents of ar.ssd + * r11 = FPSR_DEFAULT + * r12 = kernel sp (kernel virtual address) + * r13 = points to current task_struct (kernel virtual address) + * p15 = TRUE if psr.i is set in cr.ipsr + * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15: + * preserved + * + * Note that psr.ic is NOT turned on by this macro. This is so that + * we can pass interruption state as arguments to a handler. + */ +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ + MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \ + mov r27=ar.rsc; /* M */ \ + mov r20=r1; /* A */ \ + mov r25=ar.unat; /* M */ \ + mov r29=cr.ipsr; /* M */ \ + mov r26=ar.pfs; /* I */ \ + mov r28=cr.iip; /* M */ \ + mov r21=ar.fpsr; /* M */ \ + COVER; /* B;; (or nothing) */ \ + ;; \ + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \ + ;; \ + ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \ + st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \ + adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \ + /* switch from user to kernel RBS: */ \ + ;; \ + invala; /* M */ \ + SAVE_IFS; \ + cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? 
*/ \ + ;; \ + MINSTATE_START_SAVE_MIN \ + adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \ + adds r16=PT(CR_IPSR),r1; \ + ;; \ + lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \ + st8 [r16]=r29; /* save cr.ipsr */ \ + ;; \ + lfetch.fault.excl.nt1 [r17]; \ + tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \ + mov r29=b0 \ + ;; \ + adds r16=PT(R8),r1; /* initialize first base pointer */ \ + adds r17=PT(R9),r1; /* initialize second base pointer */ \ +(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r8,16; \ +.mem.offset 8,0; st8.spill [r17]=r9,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r10,24; \ +.mem.offset 8,0; st8.spill [r17]=r11,24; \ + ;; \ + st8 [r16]=r28,16; /* save cr.iip */ \ + st8 [r17]=r30,16; /* save cr.ifs */ \ +(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \ + mov r8=ar.ccv; \ + mov r9=ar.csd; \ + mov r10=ar.ssd; \ + movl r11=FPSR_DEFAULT; /* L-unit */ \ + ;; \ + st8 [r16]=r25,16; /* save ar.unat */ \ + st8 [r17]=r26,16; /* save ar.pfs */ \ + shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \ + ;; \ + st8 [r16]=r27,16; /* save ar.rsc */ \ +(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \ +(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \ + ;; /* avoid RAW on r16 & r17 */ \ +(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \ + st8 [r17]=r31,16; /* save predicates */ \ +(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \ + ;; \ + st8 [r16]=r29,16; /* save b0 */ \ + st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \ + cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \ +.mem.offset 8,0; st8.spill [r17]=r12,16; \ + adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r13,16; \ +.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \ + mov r13=IA64_KR(CURRENT); /* establish `current' */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r15,16; \ +.mem.offset 8,0; st8.spill [r17]=r14,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r2,16; \ +.mem.offset 8,0; st8.spill [r17]=r3,16; \ + adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ + ;; \ + EXTRA; \ + movl r1=__gp; /* establish kernel global pointer */ \ + ;; \ + MINSTATE_END_SAVE_MIN + +/* + * SAVE_REST saves the remainder of pt_regs (with psr.ic on). + * + * Assumed state upon entry: + * psr.ic: on + * r2: points to &pt_regs.r16 + * r3: points to &pt_regs.r17 + * r8: contents of ar.ccv + * r9: contents of ar.csd + * r10: contents of ar.ssd + * r11: FPSR_DEFAULT + * + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST. 
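+ *
+ * (SAVE_REST spills r16-r31 and f6-f11, saves b6/b7 and
+ * ar.ccv/ar.csd/ar.ssd, and reloads ar.fpsr with FPSR_DEFAULT.)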
+ */ +#define SAVE_REST \ +.mem.offset 0,0; st8.spill [r2]=r16,16; \ +.mem.offset 8,0; st8.spill [r3]=r17,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r18,16; \ +.mem.offset 8,0; st8.spill [r3]=r19,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r20,16; \ +.mem.offset 8,0; st8.spill [r3]=r21,16; \ + mov r18=b6; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r22,16; \ +.mem.offset 8,0; st8.spill [r3]=r23,16; \ + mov r19=b7; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r24,16; \ +.mem.offset 8,0; st8.spill [r3]=r25,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r26,16; \ +.mem.offset 8,0; st8.spill [r3]=r27,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r28,16; \ +.mem.offset 8,0; st8.spill [r3]=r29,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r30,16; \ +.mem.offset 8,0; st8.spill [r3]=r31,32; \ + ;; \ + mov ar.fpsr=r11; /* M-unit */ \ + st8 [r2]=r8,8; /* ar.ccv */ \ + adds r24=PT(B6)-PT(F7),r3; \ + ;; \ + stf.spill [r2]=f6,32; \ + stf.spill [r3]=f7,32; \ + ;; \ + stf.spill [r2]=f8,32; \ + stf.spill [r3]=f9,32; \ + ;; \ + stf.spill [r2]=f10; \ + stf.spill [r3]=f11; \ + adds r25=PT(B7)-PT(F11),r3; \ + ;; \ + st8 [r24]=r18,16; /* b6 */ \ + st8 [r25]=r19,16; /* b7 */ \ + ;; \ + st8 [r24]=r9; /* ar.csd */ \ + st8 [r25]=r10; /* ar.ssd */ \ + ;; + +#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,) +#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19) +#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, ) diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c new file mode 100644 index 000000000000..febc091c2f02 --- /dev/null +++ b/arch/ia64/kernel/module.c @@ -0,0 +1,952 @@ +/* + * IA-64-specific support for kernel module loader. + * + * Copyright (C) 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * Loosely based on patch by Rusty Russell. + */ + +/* relocs tested so far: + + DIR64LSB + FPTR64LSB + GPREL22 + LDXMOV + LDXMOV + LTOFF22 + LTOFF22X + LTOFF22X + LTOFF_FPTR22 + PCREL21B (for br.call only; br.cond is not supported out of modules!) + PCREL60B (for brl.cond only; brl.call is not supported for modules!) + PCREL64LSB + SECREL32LSB + SEGREL64LSB + */ + +#include <linux/config.h> + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/elf.h> +#include <linux/moduleloader.h> +#include <linux/string.h> +#include <linux/vmalloc.h> + +#include <asm/patch.h> +#include <asm/unaligned.h> + +#define ARCH_MODULE_DEBUG 0 + +#if ARCH_MODULE_DEBUG +# define DEBUGP printk +# define inline +#else +# define DEBUGP(fmt , a...) +#endif + +#ifdef CONFIG_ITANIUM +# define USE_BRL 0 +#else +# define USE_BRL 1 +#endif + +#define MAX_LTOFF ((uint64_t) (1 << 22)) /* max. 
allowable linkage-table offset */ + +/* Define some relocation helper macros/types: */ + +#define FORMAT_SHIFT 0 +#define FORMAT_BITS 3 +#define FORMAT_MASK ((1 << FORMAT_BITS) - 1) +#define VALUE_SHIFT 3 +#define VALUE_BITS 5 +#define VALUE_MASK ((1 << VALUE_BITS) - 1) + +enum reloc_target_format { + /* direct encoded formats: */ + RF_NONE = 0, + RF_INSN14 = 1, + RF_INSN22 = 2, + RF_INSN64 = 3, + RF_32MSB = 4, + RF_32LSB = 5, + RF_64MSB = 6, + RF_64LSB = 7, + + /* formats that cannot be directly decoded: */ + RF_INSN60, + RF_INSN21B, /* imm21 form 1 */ + RF_INSN21M, /* imm21 form 2 */ + RF_INSN21F /* imm21 form 3 */ +}; + +enum reloc_value_formula { + RV_DIRECT = 4, /* S + A */ + RV_GPREL = 5, /* @gprel(S + A) */ + RV_LTREL = 6, /* @ltoff(S + A) */ + RV_PLTREL = 7, /* @pltoff(S + A) */ + RV_FPTR = 8, /* @fptr(S + A) */ + RV_PCREL = 9, /* S + A - P */ + RV_LTREL_FPTR = 10, /* @ltoff(@fptr(S + A)) */ + RV_SEGREL = 11, /* @segrel(S + A) */ + RV_SECREL = 12, /* @secrel(S + A) */ + RV_BDREL = 13, /* BD + A */ + RV_LTV = 14, /* S + A (like RV_DIRECT, except frozen at static link-time) */ + RV_PCREL2 = 15, /* S + A - P */ + RV_SPECIAL = 16, /* various (see below) */ + RV_RSVD17 = 17, + RV_TPREL = 18, /* @tprel(S + A) */ + RV_LTREL_TPREL = 19, /* @ltoff(@tprel(S + A)) */ + RV_DTPMOD = 20, /* @dtpmod(S + A) */ + RV_LTREL_DTPMOD = 21, /* @ltoff(@dtpmod(S + A)) */ + RV_DTPREL = 22, /* @dtprel(S + A) */ + RV_LTREL_DTPREL = 23, /* @ltoff(@dtprel(S + A)) */ + RV_RSVD24 = 24, + RV_RSVD25 = 25, + RV_RSVD26 = 26, + RV_RSVD27 = 27 + /* 28-31 reserved for implementation-specific purposes. */ +}; + +#define N(reloc) [R_IA64_##reloc] = #reloc + +static const char *reloc_name[256] = { + N(NONE), N(IMM14), N(IMM22), N(IMM64), + N(DIR32MSB), N(DIR32LSB), N(DIR64MSB), N(DIR64LSB), + N(GPREL22), N(GPREL64I), N(GPREL32MSB), N(GPREL32LSB), + N(GPREL64MSB), N(GPREL64LSB), N(LTOFF22), N(LTOFF64I), + N(PLTOFF22), N(PLTOFF64I), N(PLTOFF64MSB), N(PLTOFF64LSB), + N(FPTR64I), N(FPTR32MSB), N(FPTR32LSB), N(FPTR64MSB), + N(FPTR64LSB), N(PCREL60B), N(PCREL21B), N(PCREL21M), + N(PCREL21F), N(PCREL32MSB), N(PCREL32LSB), N(PCREL64MSB), + N(PCREL64LSB), N(LTOFF_FPTR22), N(LTOFF_FPTR64I), N(LTOFF_FPTR32MSB), + N(LTOFF_FPTR32LSB), N(LTOFF_FPTR64MSB), N(LTOFF_FPTR64LSB), N(SEGREL32MSB), + N(SEGREL32LSB), N(SEGREL64MSB), N(SEGREL64LSB), N(SECREL32MSB), + N(SECREL32LSB), N(SECREL64MSB), N(SECREL64LSB), N(REL32MSB), + N(REL32LSB), N(REL64MSB), N(REL64LSB), N(LTV32MSB), + N(LTV32LSB), N(LTV64MSB), N(LTV64LSB), N(PCREL21BI), + N(PCREL22), N(PCREL64I), N(IPLTMSB), N(IPLTLSB), + N(COPY), N(LTOFF22X), N(LDXMOV), N(TPREL14), + N(TPREL22), N(TPREL64I), N(TPREL64MSB), N(TPREL64LSB), + N(LTOFF_TPREL22), N(DTPMOD64MSB), N(DTPMOD64LSB), N(LTOFF_DTPMOD22), + N(DTPREL14), N(DTPREL22), N(DTPREL64I), N(DTPREL32MSB), + N(DTPREL32LSB), N(DTPREL64MSB), N(DTPREL64LSB), N(LTOFF_DTPREL22) +}; + +#undef N + +struct got_entry { + uint64_t val; +}; + +struct fdesc { + uint64_t ip; + uint64_t gp; +}; + +/* Opaque struct for insns, to protect against derefs. 
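+   (Such a pointer is never dereferenced: its low two bits encode the
+   slot number within a 16-byte bundle and the remaining bits the bundle
+   address, as taken apart by bundle() and slot() below.)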
*/ +struct insn; + +static inline uint64_t +bundle (const struct insn *insn) +{ + return (uint64_t) insn & ~0xfUL; +} + +static inline int +slot (const struct insn *insn) +{ + return (uint64_t) insn & 0x3; +} + +static int +apply_imm64 (struct module *mod, struct insn *insn, uint64_t val) +{ + if (slot(insn) != 2) { + printk(KERN_ERR "%s: invalid slot number %d for IMM64\n", + mod->name, slot(insn)); + return 0; + } + ia64_patch_imm64((u64) insn, val); + return 1; +} + +static int +apply_imm60 (struct module *mod, struct insn *insn, uint64_t val) +{ + if (slot(insn) != 2) { + printk(KERN_ERR "%s: invalid slot number %d for IMM60\n", + mod->name, slot(insn)); + return 0; + } + if (val + ((uint64_t) 1 << 59) >= (1UL << 60)) { + printk(KERN_ERR "%s: value %ld out of IMM60 range\n", mod->name, (int64_t) val); + return 0; + } + ia64_patch_imm60((u64) insn, val); + return 1; +} + +static int +apply_imm22 (struct module *mod, struct insn *insn, uint64_t val) +{ + if (val + (1 << 21) >= (1 << 22)) { + printk(KERN_ERR "%s: value %li out of IMM22 range\n", mod->name, (int64_t)val); + return 0; + } + ia64_patch((u64) insn, 0x01fffcfe000UL, ( ((val & 0x200000UL) << 15) /* bit 21 -> 36 */ + | ((val & 0x1f0000UL) << 6) /* bit 16 -> 22 */ + | ((val & 0x00ff80UL) << 20) /* bit 7 -> 27 */ + | ((val & 0x00007fUL) << 13) /* bit 0 -> 13 */)); + return 1; +} + +static int +apply_imm21b (struct module *mod, struct insn *insn, uint64_t val) +{ + if (val + (1 << 20) >= (1 << 21)) { + printk(KERN_ERR "%s: value %li out of IMM21b range\n", mod->name, (int64_t)val); + return 0; + } + ia64_patch((u64) insn, 0x11ffffe000UL, ( ((val & 0x100000UL) << 16) /* bit 20 -> 36 */ + | ((val & 0x0fffffUL) << 13) /* bit 0 -> 13 */)); + return 1; +} + +#if USE_BRL + +struct plt_entry { + /* Three instruction bundles in PLT. */ + unsigned char bundle[2][16]; +}; + +static const struct plt_entry ia64_plt_template = { + { + { + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /* movl gp=TARGET_GP */ + 0x00, 0x00, 0x00, 0x60 + }, + { + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* brl.many gp=TARGET_GP */ + 0x08, 0x00, 0x00, 0xc0 + } + } +}; + +static int +patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp) +{ + if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_gp) + && apply_imm60(mod, (struct insn *) (plt->bundle[1] + 2), + (target_ip - (int64_t) plt->bundle[1]) / 16)) + return 1; + return 0; +} + +unsigned long +plt_target (struct plt_entry *plt) +{ + uint64_t b0, b1, *b = (uint64_t *) plt->bundle[1]; + long off; + + b0 = b[0]; b1 = b[1]; + off = ( ((b1 & 0x00fffff000000000UL) >> 36) /* imm20b -> bit 0 */ + | ((b0 >> 48) << 20) | ((b1 & 0x7fffffUL) << 36) /* imm39 -> bit 20 */ + | ((b1 & 0x0800000000000000UL) << 0)); /* i -> bit 59 */ + return (long) plt->bundle[1] + 16*off; +} + +#else /* !USE_BRL */ + +struct plt_entry { + /* Three instruction bundles in PLT. 
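+   (movl r16=target ip; movl gp=target gp; mov b6=r16 plus br.few b6;
+   this long form is used when brl is unavailable, i.e. CONFIG_ITANIUM.)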
*/ + unsigned char bundle[3][16]; +}; + +static const struct plt_entry ia64_plt_template = { + { + { + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* movl r16=TARGET_IP */ + 0x02, 0x00, 0x00, 0x60 + }, + { + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, /* movl gp=TARGET_GP */ + 0x00, 0x00, 0x00, 0x60 + }, + { + 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MIB] nop.m 0 */ + 0x60, 0x80, 0x04, 0x80, 0x03, 0x00, /* mov b6=r16 */ + 0x60, 0x00, 0x80, 0x00 /* br.few b6 */ + } + } +}; + +static int +patch_plt (struct module *mod, struct plt_entry *plt, long target_ip, unsigned long target_gp) +{ + if (apply_imm64(mod, (struct insn *) (plt->bundle[0] + 2), target_ip) + && apply_imm64(mod, (struct insn *) (plt->bundle[1] + 2), target_gp)) + return 1; + return 0; +} + +unsigned long +plt_target (struct plt_entry *plt) +{ + uint64_t b0, b1, *b = (uint64_t *) plt->bundle[0]; + + b0 = b[0]; b1 = b[1]; + return ( ((b1 & 0x000007f000000000) >> 36) /* imm7b -> bit 0 */ + | ((b1 & 0x07fc000000000000) >> 43) /* imm9d -> bit 7 */ + | ((b1 & 0x0003e00000000000) >> 29) /* imm5c -> bit 16 */ + | ((b1 & 0x0000100000000000) >> 23) /* ic -> bit 21 */ + | ((b0 >> 46) << 22) | ((b1 & 0x7fffff) << 40) /* imm41 -> bit 22 */ + | ((b1 & 0x0800000000000000) << 4)); /* i -> bit 63 */ +} + +#endif /* !USE_BRL */ + +void * +module_alloc (unsigned long size) +{ + if (!size) + return NULL; + return vmalloc(size); +} + +void +module_free (struct module *mod, void *module_region) +{ + if (mod->arch.init_unw_table && module_region == mod->module_init) { + unw_remove_unwind_table(mod->arch.init_unw_table); + mod->arch.init_unw_table = NULL; + } + vfree(module_region); +} + +/* Have we already seen one of these relocations? */ +/* FIXME: we could look in other sections, too --RR */ +static int +duplicate_reloc (const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) { + if (rela[i].r_info == rela[num].r_info && rela[i].r_addend == rela[num].r_addend) + return 1; + } + return 0; +} + +/* Count how many GOT entries we may need */ +static unsigned int +count_gots (const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i, ret = 0; + + /* Sure, this is order(n^2), but it's usually short, and not + time critical */ + for (i = 0; i < num; i++) { + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_IA64_LTOFF22: + case R_IA64_LTOFF22X: + case R_IA64_LTOFF64I: + case R_IA64_LTOFF_FPTR22: + case R_IA64_LTOFF_FPTR64I: + case R_IA64_LTOFF_FPTR32MSB: + case R_IA64_LTOFF_FPTR32LSB: + case R_IA64_LTOFF_FPTR64MSB: + case R_IA64_LTOFF_FPTR64LSB: + if (!duplicate_reloc(rela, i)) + ret++; + break; + } + } + return ret; +} + +/* Count how many PLT entries we may need */ +static unsigned int +count_plts (const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i, ret = 0; + + /* Sure, this is order(n^2), but it's usually short, and not + time critical */ + for (i = 0; i < num; i++) { + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_IA64_PCREL21B: + case R_IA64_PLTOFF22: + case R_IA64_PLTOFF64I: + case R_IA64_PLTOFF64MSB: + case R_IA64_PLTOFF64LSB: + case R_IA64_IPLTMSB: + case R_IA64_IPLTLSB: + if (!duplicate_reloc(rela, i)) + ret++; + break; + } + } + return ret; +} + +/* We need to create an function-descriptors for any internal function + which is referenced. 
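+   (A function descriptor pairs the entry point with the module's gp
+   value; FPTR and IPLT relocations must resolve to such a pair rather
+   than to a raw code address.)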
*/ +static unsigned int +count_fdescs (const Elf64_Rela *rela, unsigned int num) +{ + unsigned int i, ret = 0; + + /* Sure, this is order(n^2), but it's usually short, and not time critical. */ + for (i = 0; i < num; i++) { + switch (ELF64_R_TYPE(rela[i].r_info)) { + case R_IA64_FPTR64I: + case R_IA64_FPTR32LSB: + case R_IA64_FPTR32MSB: + case R_IA64_FPTR64LSB: + case R_IA64_FPTR64MSB: + case R_IA64_LTOFF_FPTR22: + case R_IA64_LTOFF_FPTR32LSB: + case R_IA64_LTOFF_FPTR32MSB: + case R_IA64_LTOFF_FPTR64I: + case R_IA64_LTOFF_FPTR64LSB: + case R_IA64_LTOFF_FPTR64MSB: + case R_IA64_IPLTMSB: + case R_IA64_IPLTLSB: + /* + * Jumps to static functions sometimes go straight to their + * offset. Of course, that may not be possible if the jump is + * from init -> core or vice. versa, so we need to generate an + * FDESC (and PLT etc) for that. + */ + case R_IA64_PCREL21B: + if (!duplicate_reloc(rela, i)) + ret++; + break; + } + } + return ret; +} + +int +module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, + struct module *mod) +{ + unsigned long core_plts = 0, init_plts = 0, gots = 0, fdescs = 0; + Elf64_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum; + + /* + * To store the PLTs and function-descriptors, we expand the .text section for + * core module-code and the .init.text section for initialization code. + */ + for (s = sechdrs; s < sechdrs_end; ++s) + if (strcmp(".core.plt", secstrings + s->sh_name) == 0) + mod->arch.core_plt = s; + else if (strcmp(".init.plt", secstrings + s->sh_name) == 0) + mod->arch.init_plt = s; + else if (strcmp(".got", secstrings + s->sh_name) == 0) + mod->arch.got = s; + else if (strcmp(".opd", secstrings + s->sh_name) == 0) + mod->arch.opd = s; + else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0) + mod->arch.unwind = s; + + if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) { + printk(KERN_ERR "%s: sections missing\n", mod->name); + return -ENOEXEC; + } + + /* GOT and PLTs can occur in any relocated section... 
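+   so every SHT_RELA section is scanned and the worst-case entry counts
+   are summed up before the section sizes are fixed below.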
*/ + for (s = sechdrs + 1; s < sechdrs_end; ++s) { + const Elf64_Rela *rels = (void *)ehdr + s->sh_offset; + unsigned long numrels = s->sh_size/sizeof(Elf64_Rela); + + if (s->sh_type != SHT_RELA) + continue; + + gots += count_gots(rels, numrels); + fdescs += count_fdescs(rels, numrels); + if (strstr(secstrings + s->sh_name, ".init")) + init_plts += count_plts(rels, numrels); + else + core_plts += count_plts(rels, numrels); + } + + mod->arch.core_plt->sh_type = SHT_NOBITS; + mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.core_plt->sh_addralign = 16; + mod->arch.core_plt->sh_size = core_plts * sizeof(struct plt_entry); + mod->arch.init_plt->sh_type = SHT_NOBITS; + mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.init_plt->sh_addralign = 16; + mod->arch.init_plt->sh_size = init_plts * sizeof(struct plt_entry); + mod->arch.got->sh_type = SHT_NOBITS; + mod->arch.got->sh_flags = ARCH_SHF_SMALL | SHF_ALLOC; + mod->arch.got->sh_addralign = 8; + mod->arch.got->sh_size = gots * sizeof(struct got_entry); + mod->arch.opd->sh_type = SHT_NOBITS; + mod->arch.opd->sh_flags = SHF_ALLOC; + mod->arch.opd->sh_addralign = 8; + mod->arch.opd->sh_size = fdescs * sizeof(struct fdesc); + DEBUGP("%s: core.plt=%lx, init.plt=%lx, got=%lx, fdesc=%lx\n", + __FUNCTION__, mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size, + mod->arch.got->sh_size, mod->arch.opd->sh_size); + return 0; +} + +static inline int +in_init (const struct module *mod, uint64_t addr) +{ + return addr - (uint64_t) mod->module_init < mod->init_size; +} + +static inline int +in_core (const struct module *mod, uint64_t addr) +{ + return addr - (uint64_t) mod->module_core < mod->core_size; +} + +static inline int +is_internal (const struct module *mod, uint64_t value) +{ + return in_init(mod, value) || in_core(mod, value); +} + +/* + * Get gp-relative offset for the linkage-table entry of VALUE. + */ +static uint64_t +get_ltoff (struct module *mod, uint64_t value, int *okp) +{ + struct got_entry *got, *e; + + if (!*okp) + return 0; + + got = (void *) mod->arch.got->sh_addr; + for (e = got; e < got + mod->arch.next_got_entry; ++e) + if (e->val == value) + goto found; + + /* Not enough GOT entries? */ + if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size)) + BUG(); + + e->val = value; + ++mod->arch.next_got_entry; + found: + return (uint64_t) e - mod->arch.gp; +} + +static inline int +gp_addressable (struct module *mod, uint64_t value) +{ + return value - mod->arch.gp + MAX_LTOFF/2 < MAX_LTOFF; +} + +/* Get PC-relative PLT entry for this value. Returns 0 on failure. */ +static uint64_t +get_plt (struct module *mod, const struct insn *insn, uint64_t value, int *okp) +{ + struct plt_entry *plt, *plt_end; + uint64_t target_ip, target_gp; + + if (!*okp) + return 0; + + if (in_init(mod, (uint64_t) insn)) { + plt = (void *) mod->arch.init_plt->sh_addr; + plt_end = (void *) plt + mod->arch.init_plt->sh_size; + } else { + plt = (void *) mod->arch.core_plt->sh_addr; + plt_end = (void *) plt + mod->arch.core_plt->sh_size; + } + + /* "value" is a pointer to a function-descriptor; fetch the target ip/gp from it: */ + target_ip = ((uint64_t *) value)[0]; + target_gp = ((uint64_t *) value)[1]; + + /* Look for existing PLT entry. 
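+	   (The PLT sections are SHT_NOBITS and start out zeroed, so a zero
+	   first bundle byte marks the first free slot.)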
*/ + while (plt->bundle[0][0]) { + if (plt_target(plt) == target_ip) + goto found; + if (++plt >= plt_end) + BUG(); + } + *plt = ia64_plt_template; + if (!patch_plt(mod, plt, target_ip, target_gp)) { + *okp = 0; + return 0; + } +#if ARCH_MODULE_DEBUG + if (plt_target(plt) != target_ip) { + printk("%s: mistargeted PLT: wanted %lx, got %lx\n", + __FUNCTION__, target_ip, plt_target(plt)); + *okp = 0; + return 0; + } +#endif + found: + return (uint64_t) plt; +} + +/* Get function descriptor for VALUE. */ +static uint64_t +get_fdesc (struct module *mod, uint64_t value, int *okp) +{ + struct fdesc *fdesc = (void *) mod->arch.opd->sh_addr; + + if (!*okp) + return 0; + + if (!value) { + printk(KERN_ERR "%s: fdesc for zero requested!\n", mod->name); + return 0; + } + + if (!is_internal(mod, value)) + /* + * If it's not a module-local entry-point, "value" already points to a + * function-descriptor. + */ + return value; + + /* Look for existing function descriptor. */ + while (fdesc->ip) { + if (fdesc->ip == value) + return (uint64_t)fdesc; + if ((uint64_t) ++fdesc >= mod->arch.opd->sh_addr + mod->arch.opd->sh_size) + BUG(); + } + + /* Create new one */ + fdesc->ip = value; + fdesc->gp = mod->arch.gp; + return (uint64_t) fdesc; +} + +static inline int +do_reloc (struct module *mod, uint8_t r_type, Elf64_Sym *sym, uint64_t addend, + Elf64_Shdr *sec, void *location) +{ + enum reloc_target_format format = (r_type >> FORMAT_SHIFT) & FORMAT_MASK; + enum reloc_value_formula formula = (r_type >> VALUE_SHIFT) & VALUE_MASK; + uint64_t val; + int ok = 1; + + val = sym->st_value + addend; + + switch (formula) { + case RV_SEGREL: /* segment base is arbitrarily chosen to be 0 for kernel modules */ + case RV_DIRECT: + break; + + case RV_GPREL: val -= mod->arch.gp; break; + case RV_LTREL: val = get_ltoff(mod, val, &ok); break; + case RV_PLTREL: val = get_plt(mod, location, val, &ok); break; + case RV_FPTR: val = get_fdesc(mod, val, &ok); break; + case RV_SECREL: val -= sec->sh_addr; break; + case RV_LTREL_FPTR: val = get_ltoff(mod, get_fdesc(mod, val, &ok), &ok); break; + + case RV_PCREL: + switch (r_type) { + case R_IA64_PCREL21B: + if ((in_init(mod, val) && in_core(mod, (uint64_t)location)) || + (in_core(mod, val) && in_init(mod, (uint64_t)location))) { + /* + * Init section may have been allocated far away from core, + * if the branch won't reach, then allocate a plt for it. + */ + uint64_t delta = ((int64_t)val - (int64_t)location) / 16; + if (delta + (1 << 20) >= (1 << 21)) { + val = get_fdesc(mod, val, &ok); + val = get_plt(mod, location, val, &ok); + } + } else if (!is_internal(mod, val)) + val = get_plt(mod, location, val, &ok); + /* FALL THROUGH */ + default: + val -= bundle(location); + break; + + case R_IA64_PCREL32MSB: + case R_IA64_PCREL32LSB: + case R_IA64_PCREL64MSB: + case R_IA64_PCREL64LSB: + val -= (uint64_t) location; + break; + + } + switch (r_type) { + case R_IA64_PCREL60B: format = RF_INSN60; break; + case R_IA64_PCREL21B: format = RF_INSN21B; break; + case R_IA64_PCREL21M: format = RF_INSN21M; break; + case R_IA64_PCREL21F: format = RF_INSN21F; break; + default: break; + } + break; + + case RV_BDREL: + val -= (uint64_t) (in_init(mod, val) ? mod->module_init : mod->module_core); + break; + + case RV_LTV: + /* can link-time value relocs happen here? 
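+ * Presumably not: they should have been resolved by "ld -r" when the
+ * module was built, so one surfacing at load time is treated as fatal.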
*/ + BUG(); + break; + + case RV_PCREL2: + if (r_type == R_IA64_PCREL21BI) { + if (!is_internal(mod, val)) { + printk(KERN_ERR "%s: %s reloc against non-local symbol (%lx)\n", + __FUNCTION__, reloc_name[r_type], val); + return -ENOEXEC; + } + format = RF_INSN21B; + } + val -= bundle(location); + break; + + case RV_SPECIAL: + switch (r_type) { + case R_IA64_IPLTMSB: + case R_IA64_IPLTLSB: + val = get_fdesc(mod, get_plt(mod, location, val, &ok), &ok); + format = RF_64LSB; + if (r_type == R_IA64_IPLTMSB) + format = RF_64MSB; + break; + + case R_IA64_SUB: + val = addend - sym->st_value; + format = RF_INSN64; + break; + + case R_IA64_LTOFF22X: + if (gp_addressable(mod, val)) + val -= mod->arch.gp; + else + val = get_ltoff(mod, val, &ok); + format = RF_INSN22; + break; + + case R_IA64_LDXMOV: + if (gp_addressable(mod, val)) { + /* turn "ld8" into "mov": */ + DEBUGP("%s: patching ld8 at %p to mov\n", __FUNCTION__, location); + ia64_patch((u64) location, 0x1fff80fe000UL, 0x10000000000UL); + } + return 0; + + default: + if (reloc_name[r_type]) + printk(KERN_ERR "%s: special reloc %s not supported", + mod->name, reloc_name[r_type]); + else + printk(KERN_ERR "%s: unknown special reloc %x\n", + mod->name, r_type); + return -ENOEXEC; + } + break; + + case RV_TPREL: + case RV_LTREL_TPREL: + case RV_DTPMOD: + case RV_LTREL_DTPMOD: + case RV_DTPREL: + case RV_LTREL_DTPREL: + printk(KERN_ERR "%s: %s reloc not supported\n", + mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?"); + return -ENOEXEC; + + default: + printk(KERN_ERR "%s: unknown reloc %x\n", mod->name, r_type); + return -ENOEXEC; + } + + if (!ok) + return -ENOEXEC; + + DEBUGP("%s: [%p]<-%016lx = %s(%lx)\n", __FUNCTION__, location, val, + reloc_name[r_type] ? reloc_name[r_type] : "?", sym->st_value + addend); + + switch (format) { + case RF_INSN21B: ok = apply_imm21b(mod, location, (int64_t) val / 16); break; + case RF_INSN22: ok = apply_imm22(mod, location, val); break; + case RF_INSN64: ok = apply_imm64(mod, location, val); break; + case RF_INSN60: ok = apply_imm60(mod, location, (int64_t) val / 16); break; + case RF_32LSB: put_unaligned(val, (uint32_t *) location); break; + case RF_64LSB: put_unaligned(val, (uint64_t *) location); break; + case RF_32MSB: /* ia64 Linux is little-endian... */ + case RF_64MSB: /* ia64 Linux is little-endian... */ + case RF_INSN14: /* must be within-module, i.e., resolved by "ld -r" */ + case RF_INSN21M: /* must be within-module, i.e., resolved by "ld -r" */ + case RF_INSN21F: /* must be within-module, i.e., resolved by "ld -r" */ + printk(KERN_ERR "%s: format %u needed by %s reloc is not supported\n", + mod->name, format, reloc_name[r_type] ? reloc_name[r_type] : "?"); + return -ENOEXEC; + + default: + printk(KERN_ERR "%s: relocation %s resulted in unknown format %u\n", + mod->name, reloc_name[r_type] ? reloc_name[r_type] : "?", format); + return -ENOEXEC; + } + return ok ? 0 : -ENOEXEC; +} + +int +apply_relocate_add (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, + unsigned int relsec, struct module *mod) +{ + unsigned int i, n = sechdrs[relsec].sh_size / sizeof(Elf64_Rela); + Elf64_Rela *rela = (void *) sechdrs[relsec].sh_addr; + Elf64_Shdr *target_sec; + int ret; + + DEBUGP("%s: applying section %u (%u relocs) to %u\n", __FUNCTION__, + relsec, n, sechdrs[relsec].sh_info); + + target_sec = sechdrs + sechdrs[relsec].sh_info; + + if (target_sec->sh_entsize == ~0UL) + /* + * If target section wasn't allocated, we don't need to relocate it. + * Happens, e.g., for debug sections. 
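+ * (The generic module loader flags such sections with sh_entsize == ~0UL,
+ * which is the marker tested for here.)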
+ */ + return 0; + + if (!mod->arch.gp) { + /* + * XXX Should have an arch-hook for running this after final section + * addresses have been selected... + */ + /* See if gp can cover the entire core module: */ + uint64_t gp = (uint64_t) mod->module_core + MAX_LTOFF / 2; + if (mod->core_size >= MAX_LTOFF) + /* + * This takes advantage of fact that SHF_ARCH_SMALL gets allocated + * at the end of the module. + */ + gp = (uint64_t) mod->module_core + mod->core_size - MAX_LTOFF / 2; + mod->arch.gp = gp; + DEBUGP("%s: placing gp at 0x%lx\n", __FUNCTION__, gp); + } + + for (i = 0; i < n; i++) { + ret = do_reloc(mod, ELF64_R_TYPE(rela[i].r_info), + ((Elf64_Sym *) sechdrs[symindex].sh_addr + + ELF64_R_SYM(rela[i].r_info)), + rela[i].r_addend, target_sec, + (void *) target_sec->sh_addr + rela[i].r_offset); + if (ret < 0) + return ret; + } + return 0; +} + +int +apply_relocate (Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, + unsigned int relsec, struct module *mod) +{ + printk(KERN_ERR "module %s: REL relocs in section %u unsupported\n", mod->name, relsec); + return -ENOEXEC; +} + +/* + * Modules contain a single unwind table which covers both the core and the init text + * sections but since the two are not contiguous, we need to split this table up such that + * we can register (and unregister) each "segment" seperately. Fortunately, this sounds + * more complicated than it really is. + */ +static void +register_unwind_table (struct module *mod) +{ + struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr; + struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start); + struct unw_table_entry tmp, *e1, *e2, *core, *init; + unsigned long num_init = 0, num_core = 0; + + /* First, count how many init and core unwind-table entries there are. */ + for (e1 = start; e1 < end; ++e1) + if (in_init(mod, e1->start_offset)) + ++num_init; + else + ++num_core; + /* + * Second, sort the table such that all unwind-table entries for the init and core + * text sections are nicely separated. We do this with a stupid bubble sort + * (unwind tables don't get ridiculously huge). + */ + for (e1 = start; e1 < end; ++e1) { + for (e2 = e1 + 1; e2 < end; ++e2) { + if (e2->start_offset < e1->start_offset) { + tmp = *e1; + *e1 = *e2; + *e2 = tmp; + } + } + } + /* + * Third, locate the init and core segments in the unwind table: + */ + if (in_init(mod, start->start_offset)) { + init = start; + core = start + num_init; + } else { + core = start; + init = start + num_core; + } + + DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __FUNCTION__, + mod->name, mod->arch.gp, num_init, num_core); + + /* + * Fourth, register both tables (if not empty). 
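+ * Each segment gets its own handle so that module_arch_cleanup() can
+ * unregister the init and core tables independently.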
+ */ + if (num_core > 0) { + mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, + core, core + num_core); + DEBUGP("%s: core: handle=%p [%p-%p)\n", __FUNCTION__, + mod->arch.core_unw_table, core, core + num_core); + } + if (num_init > 0) { + mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, + init, init + num_init); + DEBUGP("%s: init: handle=%p [%p-%p)\n", __FUNCTION__, + mod->arch.init_unw_table, init, init + num_init); + } +} + +int +module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) +{ + DEBUGP("%s: init: entry=%p\n", __FUNCTION__, mod->init); + if (mod->arch.unwind) + register_unwind_table(mod); + return 0; +} + +void +module_arch_cleanup (struct module *mod) +{ + if (mod->arch.init_unw_table) + unw_remove_unwind_table(mod->arch.init_unw_table); + if (mod->arch.core_unw_table) + unw_remove_unwind_table(mod->arch.core_unw_table); +} + +#ifdef CONFIG_SMP +void +percpu_modcopy (void *pcpudst, const void *src, unsigned long size) +{ + unsigned int i; + for (i = 0; i < NR_CPUS; i++) + if (cpu_possible(i)) + memcpy(pcpudst + __per_cpu_offset[i], src, size); +} +#endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S new file mode 100644 index 000000000000..5018c7f2e7a8 --- /dev/null +++ b/arch/ia64/kernel/pal.S @@ -0,0 +1,302 @@ +/* + * PAL Firmware support + * IA-64 Processor Programmers Reference Vol 2 + * + * Copyright (C) 1999 Don Dugger <don.dugger@intel.com> + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2001, 2003 Hewlett-Packard Co + * David Mosberger <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * + * 05/22/2000 eranian Added support for stacked register calls + * 05/24/2000 eranian Added support for physical mode static calls + */ + +#include <asm/asmmacro.h> +#include <asm/processor.h> + + .data +pal_entry_point: + data8 ia64_pal_default_handler + .text + +/* + * Set the PAL entry point address. This could be written in C code, but we do it here + * to keep it all in one module (besides, it's so trivial that it's + * not a big deal). + * + * in0 Address of the PAL entry point (text address, NOT a function descriptor). + */ +GLOBAL_ENTRY(ia64_pal_handler_init) + alloc r3=ar.pfs,1,0,0,0 + movl r2=pal_entry_point + ;; + st8 [r2]=in0 + br.ret.sptk.many rp +END(ia64_pal_handler_init) + +/* + * Default PAL call handler. This needs to be coded in assembly because it uses + * the static calling convention, i.e., the RSE may not be used and calls are + * done via "br.cond" (not "br.call"). + */ +GLOBAL_ENTRY(ia64_pal_default_handler) + mov r8=-1 + br.cond.sptk.many rp +END(ia64_pal_default_handler) + +/* + * Make a PAL call using the static calling convention. 
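+ * The static convention passes its arguments in r28-r31 and returns
+ * status in r8; the RSE may not be used, so the "call" below is a
+ * br.cond with a hand-computed return address rather than a br.call.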
+ *
+ * in0		Index of PAL service
+ * in1 - in3	Remaining PAL arguments
+ * in4		1 ==> clear psr.ic,  0 ==> don't clear psr.ic
+ *
+ */
+GLOBAL_ENTRY(ia64_pal_call_static)
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
+	alloc loc1 = ar.pfs,5,5,0,0
+	movl loc2 = pal_entry_point
+1:	{
+	  mov r28 = in0
+	  mov r29 = in1
+	  mov r8 = ip
+	}
+	;;
+	ld8 loc2 = [loc2]		// loc2 <- entry point
+	tbit.nz p6,p7 = in4, 0
+	adds r8 = 1f-1b,r8
+	mov loc4=ar.rsc			// save RSE configuration
+	;;
+	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
+	mov loc3 = psr
+	mov loc0 = rp
+	.body
+	mov r30 = in2
+
+(p6)	rsm psr.i | psr.ic
+	mov r31 = in3
+	mov b7 = loc2
+
+(p7)	rsm psr.i
+	;;
+(p6)	srlz.i
+	mov rp = r8
+	br.cond.sptk.many b7
+1:	mov psr.l = loc3
+	mov ar.rsc = loc4		// restore RSE configuration
+	mov ar.pfs = loc1
+	mov rp = loc0
+	;;
+	srlz.d				// serialize restoration of psr.l
+	br.ret.sptk.many b0
+END(ia64_pal_call_static)
+
+/*
+ * Make a PAL call using the stacked registers calling convention.
+ *
+ * Inputs:
+ *	in0		Index of PAL service
+ *	in1 - in3	Remaining PAL arguments
+ */
+GLOBAL_ENTRY(ia64_pal_call_stacked)
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
+	alloc loc1 = ar.pfs,4,4,4,0
+	movl loc2 = pal_entry_point
+
+	mov r28 = in0			// Index MUST be copied to r28
+	mov out0 = in0			// AND in0 of PAL function
+	mov loc0 = rp
+	.body
+	;;
+	ld8 loc2 = [loc2]		// loc2 <- entry point
+	mov out1 = in1
+	mov out2 = in2
+	mov out3 = in3
+	mov loc3 = psr
+	;;
+	rsm psr.i
+	mov b7 = loc2
+	;;
+	br.call.sptk.many rp=b7		// now make the call
+.ret0:	mov psr.l = loc3
+	mov ar.pfs = loc1
+	mov rp = loc0
+	;;
+	srlz.d				// serialize restoration of psr.l
+	br.ret.sptk.many b0
+END(ia64_pal_call_stacked)
+
+/*
+ * Make a physical mode PAL call using the static registers calling convention.
+ *
+ * Inputs:
+ *	in0		Index of PAL service
+ *	in1 - in3	Remaining PAL arguments
+ *
+ * PSR_LP, PSR_TB, PSR_ID, PSR_DA are never set by the kernel.
+ * So we don't need to clear them.
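+ * The bits that do need changing are collected in the two masks below;
+ * the mode switch itself is done by ia64_switch_mode_phys() and
+ * ia64_switch_mode_virt().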
+ */
+#define PAL_PSR_BITS_TO_CLEAR							\
+	(IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_DB | IA64_PSR_RT |	\
+	 IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED |		\
+	 IA64_PSR_DFL | IA64_PSR_DFH)
+
+#define PAL_PSR_BITS_TO_SET							\
+	(IA64_PSR_BN)
+
+
+GLOBAL_ENTRY(ia64_pal_call_phys_static)
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(4)
+	alloc loc1 = ar.pfs,4,7,0,0
+	movl loc2 = pal_entry_point
+1:	{
+	  mov r28 = in0		// copy procedure index
+	  mov r8 = ip		// save ip to compute branch
+	  mov loc0 = rp		// save rp
+	}
+	.body
+	;;
+	ld8 loc2 = [loc2]	// loc2 <- entry point
+	mov r29 = in1		// first argument
+	mov r30 = in2		// copy arg2
+	mov r31 = in3		// copy arg3
+	;;
+	mov loc3 = psr		// save psr
+	adds r8 = 1f-1b,r8	// calculate return address for call
+	;;
+	mov loc4=ar.rsc		// save RSE configuration
+	dep.z loc2=loc2,0,61	// convert pal entry point to physical
+	tpa r8=r8		// convert rp to physical
+	;;
+	mov b7 = loc2		// install target to branch reg
+	mov ar.rsc=0		// put RSE in enforced lazy, LE mode
+	movl r16=PAL_PSR_BITS_TO_CLEAR
+	movl r17=PAL_PSR_BITS_TO_SET
+	;;
+	or loc3=loc3,r17	// add in psr the bits to set
+	;;
+	andcm r16=loc3,r16	// remove bits to clear from psr
+	br.call.sptk.many rp=ia64_switch_mode_phys
+.ret1:	mov rp = r8		// install return address (physical)
+	mov loc5 = r19
+	mov loc6 = r20
+	br.cond.sptk.many b7
+1:
+	mov ar.rsc=0		// put RSE in enforced lazy, LE mode
+	mov r16=loc3		// r16= original psr
+	mov r19=loc5
+	mov r20=loc6
+	br.call.sptk.many rp=ia64_switch_mode_virt	// return to virtual mode
+.ret2:
+	mov psr.l = loc3	// restore init PSR
+
+	mov ar.pfs = loc1
+	mov rp = loc0
+	;;
+	mov ar.rsc=loc4		// restore RSE configuration
+	srlz.d			// serialize restoration of psr.l
+	br.ret.sptk.many b0
+END(ia64_pal_call_phys_static)
+
+/*
+ * Make a PAL call using the stacked registers in physical mode.
+ *
+ * Inputs:
+ *	in0		Index of PAL service
+ *	in1 - in3	Remaining PAL arguments
+ */
+GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
+	.prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(5)
+	alloc loc1 = ar.pfs,5,7,4,0
+	movl loc2 = pal_entry_point
+1:	{
+	  mov r28 = in0		// copy procedure index
+	  mov loc0 = rp		// save rp
+	}
+	.body
+	;;
+	ld8 loc2 = [loc2]	// loc2 <- entry point
+	mov out0 = in0		// first argument
+	mov out1 = in1		// copy arg2
+	mov out2 = in2		// copy arg3
+	mov out3 = in3		// copy arg4
+	;;
+	mov loc3 = psr		// save psr
+	;;
+	mov loc4=ar.rsc		// save RSE configuration
+	dep.z loc2=loc2,0,61	// convert pal entry point to physical
+	;;
+	mov ar.rsc=0		// put RSE in enforced lazy, LE mode
+	movl r16=PAL_PSR_BITS_TO_CLEAR
+	movl r17=PAL_PSR_BITS_TO_SET
+	;;
+	or loc3=loc3,r17	// add in psr the bits to set
+	mov b7 = loc2		// install target to branch reg
+	;;
+	andcm r16=loc3,r16	// remove bits to clear from psr
+	br.call.sptk.many rp=ia64_switch_mode_phys
+.ret6:
+	mov loc5 = r19
+	mov loc6 = r20
+	br.call.sptk.many rp=b7	// now make the call
+.ret7:
+	mov ar.rsc=0		// put RSE in enforced lazy, LE mode
+	mov r16=loc3		// r16= original psr
+	mov r19=loc5
+	mov r20=loc6
+	br.call.sptk.many rp=ia64_switch_mode_virt	// return to virtual mode
+
+.ret8:	mov psr.l = loc3	// restore init PSR
+	mov ar.pfs = loc1
+	mov rp = loc0
+	;;
+	mov ar.rsc=loc4		// restore RSE configuration
+	srlz.d			// serialize restoration of psr.l
+	br.ret.sptk.many b0
+END(ia64_pal_call_phys_stacked)
+
+/*
+ * Save scratch fp regs which aren't saved in pt_regs already (fp10-fp15).
+ * + * NOTE: We need to do this since firmware (SAL and PAL) may use any of the scratch + * regs fp-low partition. + * + * Inputs: + * in0 Address of stack storage for fp regs + */ +GLOBAL_ENTRY(ia64_save_scratch_fpregs) + alloc r3=ar.pfs,1,0,0,0 + add r2=16,in0 + ;; + stf.spill [in0] = f10,32 + stf.spill [r2] = f11,32 + ;; + stf.spill [in0] = f12,32 + stf.spill [r2] = f13,32 + ;; + stf.spill [in0] = f14,32 + stf.spill [r2] = f15,32 + br.ret.sptk.many rp +END(ia64_save_scratch_fpregs) + +/* + * Load scratch fp scratch regs (fp10-fp15) + * + * Inputs: + * in0 Address of stack storage for fp regs + */ +GLOBAL_ENTRY(ia64_load_scratch_fpregs) + alloc r3=ar.pfs,1,0,0,0 + add r2=16,in0 + ;; + ldf.fill f10 = [in0],32 + ldf.fill f11 = [r2],32 + ;; + ldf.fill f12 = [in0],32 + ldf.fill f13 = [r2],32 + ;; + ldf.fill f14 = [in0],32 + ldf.fill f15 = [r2],32 + br.ret.sptk.many rp +END(ia64_load_scratch_fpregs) diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c new file mode 100644 index 000000000000..25e7c8344564 --- /dev/null +++ b/arch/ia64/kernel/palinfo.c @@ -0,0 +1,1023 @@ +/* + * palinfo.c + * + * Prints processor specific information reported by PAL. + * This code is based on specification of PAL as of the + * Intel IA-64 Architecture Software Developer's Manual v1.0. + * + * + * Copyright (C) 2000-2001, 2003 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2004 Intel Corporation + * Ashok Raj <ashok.raj@intel.com> + * + * 05/26/2000 S.Eranian initial release + * 08/21/2000 S.Eranian updated to July 2000 PAL specs + * 02/05/2001 S.Eranian fixed module support + * 10/23/2001 S.Eranian updated pal_perf_mon_info bug fixes + * 03/24/2004 Ashok Raj updated to work with CPU Hotplug + */ +#include <linux/config.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> + +#include <asm/pal.h> +#include <asm/sal.h> +#include <asm/page.h> +#include <asm/processor.h> +#include <linux/smp.h> + +MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); +MODULE_DESCRIPTION("/proc interface to IA-64 PAL"); +MODULE_LICENSE("GPL"); + +#define PALINFO_VERSION "0.5" + +typedef int (*palinfo_func_t)(char*); + +typedef struct { + const char *name; /* name of the proc entry */ + palinfo_func_t proc_read; /* function to call for reading */ + struct proc_dir_entry *entry; /* registered entry (removal) */ +} palinfo_entry_t; + + +/* + * A bunch of string array to get pretty printing + */ + +static char *cache_types[] = { + "", /* not used */ + "Instruction", + "Data", + "Data/Instruction" /* unified */ +}; + +static const char *cache_mattrib[]={ + "WriteThrough", + "WriteBack", + "", /* reserved */ + "" /* reserved */ +}; + +static const char *cache_st_hints[]={ + "Temporal, level 1", + "Reserved", + "Reserved", + "Non-temporal, all levels", + "Reserved", + "Reserved", + "Reserved", + "Reserved" +}; + +static const char *cache_ld_hints[]={ + "Temporal, level 1", + "Non-temporal, level 1", + "Reserved", + "Non-temporal, all levels", + "Reserved", + "Reserved", + "Reserved", + "Reserved" +}; + +static const char *rse_hints[]={ + "enforced lazy", + "eager stores", + "eager loads", + "eager loads and stores" +}; + +#define RSE_HINTS_COUNT ARRAY_SIZE(rse_hints) + +static const char *mem_attrib[]={ + "WB", /* 000 */ + "SW", /* 001 */ + "010", /* 010 */ + "011", /* 011 */ + "UC", 
/* 100 */
+	"UCE",		/* 101 */
+	"WC",		/* 110 */
+	"NaTPage"	/* 111 */
+};
+
+/*
+ * Take a 64-bit vector and produce a string listing 2^n in clear text
+ * for every bit n that is set, adjusted to the right unit (K/M/G/T).
+ *
+ * Input:
+ *	- a pointer to a buffer to hold the string
+ *	- a 64-bit vector
+ * Output:
+ *	- a pointer to the end of the buffer
+ *
+ */
+static char *
+bitvector_process(char *p, u64 vector)
+{
+	int i,j;
+	const char *units[]={ "", "K", "M", "G", "T" };
+
+	for (i=0, j=0; i < 64; i++ , j=i/10) {
+		if (vector & 0x1) {
+			p += sprintf(p, "%d%s ", 1 << (i-j*10), units[j]);
+		}
+		vector >>= 1;
+	}
+	return p;
+}
+
+/*
+ * Take a 64-bit vector and produce a string listing every register n
+ * whose bit is set; consecutive registers are collapsed into ranges.
+ *
+ * Input:
+ *	- a pointer to a buffer to hold the string
+ *	- a 64-bit vector
+ * Output:
+ *	- a pointer to the end of the buffer
+ *
+ */
+static char *
+bitregister_process(char *p, u64 *reg_info, int max)
+{
+	int i, begin, skip = 0;
+	u64 value = reg_info[0];
+
+	value >>= i = begin = ffs(value) - 1;
+
+	for(; i < max; i++ ) {
+
+		if (i != 0 && (i%64) == 0) value = *++reg_info;
+
+		if ((value & 0x1) == 0 && skip == 0) {
+			if (begin  <= i - 2)
+				p += sprintf(p, "%d-%d ", begin, i-1);
+			else
+				p += sprintf(p, "%d ", i-1);
+			skip  = 1;
+			begin = -1;
+		} else if ((value & 0x1) && skip == 1) {
+			skip = 0;
+			begin = i;
+		}
+		value >>=1;
+	}
+	if (begin > -1) {
+		if (begin < 127)
+			p += sprintf(p, "%d-127", begin);
+		else
+			p += sprintf(p, "127");
+	}
+
+	return p;
+}
+
+static int
+power_info(char *page)
+{
+	s64 status;
+	char *p = page;
+	u64 halt_info_buffer[8];
+	pal_power_mgmt_info_u_t *halt_info =(pal_power_mgmt_info_u_t *)halt_info_buffer;
+	int i;
+
+	status = ia64_pal_halt_info(halt_info);
+	if (status != 0) return 0;
+
+	for (i=0; i < 8 ; i++ ) {
+		if (halt_info[i].pal_power_mgmt_info_s.im == 1) {
+			p += sprintf(p,	"Power level %d:\n"
+				"\tentry_latency       : %d cycles\n"
+				"\texit_latency        : %d cycles\n"
+				"\tpower consumption   : %d mW\n"
+				"\tCache+TLB coherency : %s\n", i,
+				halt_info[i].pal_power_mgmt_info_s.entry_latency,
+				halt_info[i].pal_power_mgmt_info_s.exit_latency,
+				halt_info[i].pal_power_mgmt_info_s.power_consumption,
+				halt_info[i].pal_power_mgmt_info_s.co ?
"Yes" : "No"); + } else { + p += sprintf(p,"Power level %d: not implemented\n",i); + } + } + return p - page; +} + +static int +cache_info(char *page) +{ + char *p = page; + u64 i, levels, unique_caches; + pal_cache_config_info_t cci; + int j, k; + s64 status; + + if ((status = ia64_pal_cache_summary(&levels, &unique_caches)) != 0) { + printk(KERN_ERR "ia64_pal_cache_summary=%ld\n", status); + return 0; + } + + p += sprintf(p, "Cache levels : %ld\nUnique caches : %ld\n\n", levels, unique_caches); + + for (i=0; i < levels; i++) { + + for (j=2; j >0 ; j--) { + + /* even without unification some level may not be present */ + if ((status=ia64_pal_cache_config_info(i,j, &cci)) != 0) { + continue; + } + p += sprintf(p, + "%s Cache level %lu:\n" + "\tSize : %lu bytes\n" + "\tAttributes : ", + cache_types[j+cci.pcci_unified], i+1, + cci.pcci_cache_size); + + if (cci.pcci_unified) p += sprintf(p, "Unified "); + + p += sprintf(p, "%s\n", cache_mattrib[cci.pcci_cache_attr]); + + p += sprintf(p, + "\tAssociativity : %d\n" + "\tLine size : %d bytes\n" + "\tStride : %d bytes\n", + cci.pcci_assoc, 1<<cci.pcci_line_size, 1<<cci.pcci_stride); + if (j == 1) + p += sprintf(p, "\tStore latency : N/A\n"); + else + p += sprintf(p, "\tStore latency : %d cycle(s)\n", + cci.pcci_st_latency); + + p += sprintf(p, + "\tLoad latency : %d cycle(s)\n" + "\tStore hints : ", cci.pcci_ld_latency); + + for(k=0; k < 8; k++ ) { + if ( cci.pcci_st_hints & 0x1) + p += sprintf(p, "[%s]", cache_st_hints[k]); + cci.pcci_st_hints >>=1; + } + p += sprintf(p, "\n\tLoad hints : "); + + for(k=0; k < 8; k++ ) { + if (cci.pcci_ld_hints & 0x1) + p += sprintf(p, "[%s]", cache_ld_hints[k]); + cci.pcci_ld_hints >>=1; + } + p += sprintf(p, + "\n\tAlias boundary : %d byte(s)\n" + "\tTag LSB : %d\n" + "\tTag MSB : %d\n", + 1<<cci.pcci_alias_boundary, cci.pcci_tag_lsb, + cci.pcci_tag_msb); + + /* when unified, data(j=2) is enough */ + if (cci.pcci_unified) break; + } + } + return p - page; +} + + +static int +vm_info(char *page) +{ + char *p = page; + u64 tr_pages =0, vw_pages=0, tc_pages; + u64 attrib; + pal_vm_info_1_u_t vm_info_1; + pal_vm_info_2_u_t vm_info_2; + pal_tc_info_u_t tc_info; + ia64_ptce_info_t ptce; + const char *sep; + int i, j; + s64 status; + + if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { + printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); + return 0; + } + + + p += sprintf(p, + "Physical Address Space : %d bits\n" + "Virtual Address Space : %d bits\n" + "Protection Key Registers(PKR) : %d\n" + "Implemented bits in PKR.key : %d\n" + "Hash Tag ID : 0x%x\n" + "Size of RR.rid : %d\n", + vm_info_1.pal_vm_info_1_s.phys_add_size, + vm_info_2.pal_vm_info_2_s.impl_va_msb+1, vm_info_1.pal_vm_info_1_s.max_pkr+1, + vm_info_1.pal_vm_info_1_s.key_size, vm_info_1.pal_vm_info_1_s.hash_tag_id, + vm_info_2.pal_vm_info_2_s.rid_size); + + if (ia64_pal_mem_attrib(&attrib) != 0) + return 0; + + p += sprintf(p, "Supported memory attributes : "); + sep = ""; + for (i = 0; i < 8; i++) { + if (attrib & (1 << i)) { + p += sprintf(p, "%s%s", sep, mem_attrib[i]); + sep = ", "; + } + } + p += sprintf(p, "\n"); + + if ((status = ia64_pal_vm_page_size(&tr_pages, &vw_pages)) !=0) { + printk(KERN_ERR "ia64_pal_vm_page_size=%ld\n", status); + return 0; + } + + p += sprintf(p, + "\nTLB walker : %simplemented\n" + "Number of DTR : %d\n" + "Number of ITR : %d\n" + "TLB insertable page sizes : ", + vm_info_1.pal_vm_info_1_s.vw ? 
"" : "not ", + vm_info_1.pal_vm_info_1_s.max_dtr_entry+1, + vm_info_1.pal_vm_info_1_s.max_itr_entry+1); + + + p = bitvector_process(p, tr_pages); + + p += sprintf(p, "\nTLB purgeable page sizes : "); + + p = bitvector_process(p, vw_pages); + + if ((status=ia64_get_ptce(&ptce)) != 0) { + printk(KERN_ERR "ia64_get_ptce=%ld\n", status); + return 0; + } + + p += sprintf(p, + "\nPurge base address : 0x%016lx\n" + "Purge outer loop count : %d\n" + "Purge inner loop count : %d\n" + "Purge outer loop stride : %d\n" + "Purge inner loop stride : %d\n", + ptce.base, ptce.count[0], ptce.count[1], ptce.stride[0], ptce.stride[1]); + + p += sprintf(p, + "TC Levels : %d\n" + "Unique TC(s) : %d\n", + vm_info_1.pal_vm_info_1_s.num_tc_levels, + vm_info_1.pal_vm_info_1_s.max_unique_tcs); + + for(i=0; i < vm_info_1.pal_vm_info_1_s.num_tc_levels; i++) { + for (j=2; j>0 ; j--) { + tc_pages = 0; /* just in case */ + + + /* even without unification, some levels may not be present */ + if ((status=ia64_pal_vm_info(i,j, &tc_info, &tc_pages)) != 0) { + continue; + } + + p += sprintf(p, + "\n%s Translation Cache Level %d:\n" + "\tHash sets : %d\n" + "\tAssociativity : %d\n" + "\tNumber of entries : %d\n" + "\tFlags : ", + cache_types[j+tc_info.tc_unified], i+1, tc_info.tc_num_sets, + tc_info.tc_associativity, tc_info.tc_num_entries); + + if (tc_info.tc_pf) p += sprintf(p, "PreferredPageSizeOptimized "); + if (tc_info.tc_unified) p += sprintf(p, "Unified "); + if (tc_info.tc_reduce_tr) p += sprintf(p, "TCReduction"); + + p += sprintf(p, "\n\tSupported page sizes: "); + + p = bitvector_process(p, tc_pages); + + /* when unified date (j=2) is enough */ + if (tc_info.tc_unified) break; + } + } + p += sprintf(p, "\n"); + + return p - page; +} + + +static int +register_info(char *page) +{ + char *p = page; + u64 reg_info[2]; + u64 info; + u64 phys_stacked; + pal_hints_u_t hints; + u64 iregs, dregs; + char *info_type[]={ + "Implemented AR(s)", + "AR(s) with read side-effects", + "Implemented CR(s)", + "CR(s) with read side-effects", + }; + + for(info=0; info < 4; info++) { + + if (ia64_pal_register_info(info, ®_info[0], ®_info[1]) != 0) return 0; + + p += sprintf(p, "%-32s : ", info_type[info]); + + p = bitregister_process(p, reg_info, 128); + + p += sprintf(p, "\n"); + } + + if (ia64_pal_rse_info(&phys_stacked, &hints) != 0) return 0; + + p += sprintf(p, + "RSE stacked physical registers : %ld\n" + "RSE load/store hints : %ld (%s)\n", + phys_stacked, hints.ph_data, + hints.ph_data < RSE_HINTS_COUNT ? 
rse_hints[hints.ph_data]: "(??)"); + + if (ia64_pal_debug_info(&iregs, &dregs)) + return 0; + + p += sprintf(p, + "Instruction debug register pairs : %ld\n" + "Data debug register pairs : %ld\n", iregs, dregs); + + return p - page; +} + +static const char *proc_features[]={ + NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, + NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL, + NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, + NULL,NULL,NULL,NULL,NULL, NULL,NULL,NULL,NULL, + NULL,NULL,NULL,NULL,NULL, + "XIP,XPSR,XFS implemented", + "XR1-XR3 implemented", + "Disable dynamic predicate prediction", + "Disable processor physical number", + "Disable dynamic data cache prefetch", + "Disable dynamic inst cache prefetch", + "Disable dynamic branch prediction", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "Disable BINIT on processor time-out", + "Disable dynamic power management (DPM)", + "Disable coherency", + "Disable cache", + "Enable CMCI promotion", + "Enable MCA to BINIT promotion", + "Enable MCA promotion", + "Enable BERR promotion" +}; + + +static int +processor_info(char *page) +{ + char *p = page; + const char **v = proc_features; + u64 avail=1, status=1, control=1; + int i; + s64 ret; + + if ((ret=ia64_pal_proc_get_features(&avail, &status, &control)) != 0) return 0; + + for(i=0; i < 64; i++, v++,avail >>=1, status >>=1, control >>=1) { + if ( ! *v ) continue; + p += sprintf(p, "%-40s : %s%s %s\n", *v, + avail & 0x1 ? "" : "NotImpl", + avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "", + avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): ""); + } + return p - page; +} + +static const char *bus_features[]={ + NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, + NULL,NULL,NULL,NULL,NULL,NULL,NULL, NULL,NULL, + NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, + NULL,NULL, + "Request Bus Parking", + "Bus Lock Mask", + "Enable Half Transfer", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "Enable Cache Line Repl. Shared", + "Enable Cache Line Repl. Exclusive", + "Disable Transaction Queuing", + "Disable Response Error Checking", + "Disable Bus Error Checking", + "Disable Bus Requester Internal Error Signalling", + "Disable Bus Requester Error Signalling", + "Disable Bus Initialization Event Checking", + "Disable Bus Initialization Event Signalling", + "Disable Bus Address Error Checking", + "Disable Bus Address Error Signalling", + "Disable Bus Data Error Checking" +}; + + +static int +bus_info(char *page) +{ + char *p = page; + const char **v = bus_features; + pal_bus_features_u_t av, st, ct; + u64 avail, status, control; + int i; + s64 ret; + + if ((ret=ia64_pal_bus_get_features(&av, &st, &ct)) != 0) return 0; + + avail = av.pal_bus_features_val; + status = st.pal_bus_features_val; + control = ct.pal_bus_features_val; + + for(i=0; i < 64; i++, v++, avail >>=1, status >>=1, control >>=1) { + if ( ! *v ) continue; + p += sprintf(p, "%-48s : %s%s %s\n", *v, + avail & 0x1 ? "" : "NotImpl", + avail & 0x1 ? (status & 0x1 ? "On" : "Off"): "", + avail & 0x1 ? (control & 0x1 ? "Ctrl" : "NoCtrl"): ""); + } + return p - page; +} + +static int +version_info(char *page) +{ + pal_version_u_t min_ver, cur_ver; + char *p = page; + + /* The PAL_VERSION call is advertised as being able to support + * both physical and virtual mode calls. This seems to be a documentation + * bug rather than firmware bug. In fact, it does only support physical mode. + * So now the code reflects this fact and the pal_version() has been updated + * accordingly. 
+ */ + if (ia64_pal_version(&min_ver, &cur_ver) != 0) return 0; + + p += sprintf(p, + "PAL_vendor : 0x%02x (min=0x%02x)\n" + "PAL_A : %x.%x.%x (min=%x.%x.%x)\n" + "PAL_B : %x.%x.%x (min=%x.%x.%x)\n", + cur_ver.pal_version_s.pv_pal_vendor, min_ver.pal_version_s.pv_pal_vendor, + + cur_ver.pal_version_s.pv_pal_a_model>>4, + cur_ver.pal_version_s.pv_pal_a_model&0xf, cur_ver.pal_version_s.pv_pal_a_rev, + min_ver.pal_version_s.pv_pal_a_model>>4, + min_ver.pal_version_s.pv_pal_a_model&0xf, min_ver.pal_version_s.pv_pal_a_rev, + + cur_ver.pal_version_s.pv_pal_b_model>>4, + cur_ver.pal_version_s.pv_pal_b_model&0xf, cur_ver.pal_version_s.pv_pal_b_rev, + min_ver.pal_version_s.pv_pal_b_model>>4, + min_ver.pal_version_s.pv_pal_b_model&0xf, min_ver.pal_version_s.pv_pal_b_rev); + return p - page; +} + +static int +perfmon_info(char *page) +{ + char *p = page; + u64 pm_buffer[16]; + pal_perf_mon_info_u_t pm_info; + + if (ia64_pal_perf_mon_info(pm_buffer, &pm_info) != 0) return 0; + + p += sprintf(p, + "PMC/PMD pairs : %d\n" + "Counter width : %d bits\n" + "Cycle event number : %d\n" + "Retired event number : %d\n" + "Implemented PMC : ", + pm_info.pal_perf_mon_info_s.generic, pm_info.pal_perf_mon_info_s.width, + pm_info.pal_perf_mon_info_s.cycles, pm_info.pal_perf_mon_info_s.retired); + + p = bitregister_process(p, pm_buffer, 256); + p += sprintf(p, "\nImplemented PMD : "); + p = bitregister_process(p, pm_buffer+4, 256); + p += sprintf(p, "\nCycles count capable : "); + p = bitregister_process(p, pm_buffer+8, 256); + p += sprintf(p, "\nRetired bundles count capable : "); + +#ifdef CONFIG_ITANIUM + /* + * PAL_PERF_MON_INFO reports that only PMC4 can be used to count CPU_CYCLES + * which is wrong, both PMC4 and PMD5 support it. + */ + if (pm_buffer[12] == 0x10) pm_buffer[12]=0x30; +#endif + + p = bitregister_process(p, pm_buffer+12, 256); + + p += sprintf(p, "\n"); + + return p - page; +} + +static int +frequency_info(char *page) +{ + char *p = page; + struct pal_freq_ratio proc, itc, bus; + u64 base; + + if (ia64_pal_freq_base(&base) == -1) + p += sprintf(p, "Output clock : not implemented\n"); + else + p += sprintf(p, "Output clock : %ld ticks/s\n", base); + + if (ia64_pal_freq_ratios(&proc, &bus, &itc) != 0) return 0; + + p += sprintf(p, + "Processor/Clock ratio : %ld/%ld\n" + "Bus/Clock ratio : %ld/%ld\n" + "ITC/Clock ratio : %ld/%ld\n", + proc.num, proc.den, bus.num, bus.den, itc.num, itc.den); + + return p - page; +} + +static int +tr_info(char *page) +{ + char *p = page; + s64 status; + pal_tr_valid_u_t tr_valid; + u64 tr_buffer[4]; + pal_vm_info_1_u_t vm_info_1; + pal_vm_info_2_u_t vm_info_2; + u64 i, j; + u64 max[3], pgm; + struct ifa_reg { + u64 valid:1; + u64 ig:11; + u64 vpn:52; + } *ifa_reg; + struct itir_reg { + u64 rv1:2; + u64 ps:6; + u64 key:24; + u64 rv2:32; + } *itir_reg; + struct gr_reg { + u64 p:1; + u64 rv1:1; + u64 ma:3; + u64 a:1; + u64 d:1; + u64 pl:2; + u64 ar:3; + u64 ppn:38; + u64 rv2:2; + u64 ed:1; + u64 ig:11; + } *gr_reg; + struct rid_reg { + u64 ig1:1; + u64 rv1:1; + u64 ig2:6; + u64 rid:24; + u64 rv2:32; + } *rid_reg; + + if ((status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2)) !=0) { + printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); + return 0; + } + max[0] = vm_info_1.pal_vm_info_1_s.max_itr_entry+1; + max[1] = vm_info_1.pal_vm_info_1_s.max_dtr_entry+1; + + for (i=0; i < 2; i++ ) { + for (j=0; j < max[i]; j++) { + + status = ia64_pal_tr_read(j, i, tr_buffer, &tr_valid); + if (status != 0) { + printk(KERN_ERR "palinfo: pal call failed on tr[%lu:%lu]=%ld\n", + i, j, 
status); + continue; + } + + ifa_reg = (struct ifa_reg *)&tr_buffer[2]; + + if (ifa_reg->valid == 0) continue; + + gr_reg = (struct gr_reg *)tr_buffer; + itir_reg = (struct itir_reg *)&tr_buffer[1]; + rid_reg = (struct rid_reg *)&tr_buffer[3]; + + pgm = -1 << (itir_reg->ps - 12); + p += sprintf(p, + "%cTR%lu: av=%d pv=%d dv=%d mv=%d\n" + "\tppn : 0x%lx\n" + "\tvpn : 0x%lx\n" + "\tps : ", + "ID"[i], j, + tr_valid.pal_tr_valid_s.access_rights_valid, + tr_valid.pal_tr_valid_s.priv_level_valid, + tr_valid.pal_tr_valid_s.dirty_bit_valid, + tr_valid.pal_tr_valid_s.mem_attr_valid, + (gr_reg->ppn & pgm)<< 12, (ifa_reg->vpn & pgm)<< 12); + + p = bitvector_process(p, 1<< itir_reg->ps); + + p += sprintf(p, + "\n\tpl : %d\n" + "\tar : %d\n" + "\trid : %x\n" + "\tp : %d\n" + "\tma : %d\n" + "\td : %d\n", + gr_reg->pl, gr_reg->ar, rid_reg->rid, gr_reg->p, gr_reg->ma, + gr_reg->d); + } + } + return p - page; +} + + + +/* + * List {name,function} pairs for every entry in /proc/palinfo/cpu* + */ +static palinfo_entry_t palinfo_entries[]={ + { "version_info", version_info, }, + { "vm_info", vm_info, }, + { "cache_info", cache_info, }, + { "power_info", power_info, }, + { "register_info", register_info, }, + { "processor_info", processor_info, }, + { "perfmon_info", perfmon_info, }, + { "frequency_info", frequency_info, }, + { "bus_info", bus_info }, + { "tr_info", tr_info, } +}; + +#define NR_PALINFO_ENTRIES (int) ARRAY_SIZE(palinfo_entries) + +/* + * this array is used to keep track of the proc entries we create. This is + * required in the module mode when we need to remove all entries. The procfs code + * does not do recursion of deletion + * + * Notes: + * - +1 accounts for the cpuN directory entry in /proc/pal + */ +#define NR_PALINFO_PROC_ENTRIES (NR_CPUS*(NR_PALINFO_ENTRIES+1)) + +static struct proc_dir_entry *palinfo_proc_entries[NR_PALINFO_PROC_ENTRIES]; +static struct proc_dir_entry *palinfo_dir; + +/* + * This data structure is used to pass which cpu,function is being requested + * It must fit in a 64bit quantity to be passed to the proc callback routine + * + * In SMP mode, when we get a request for another CPU, we must call that + * other CPU using IPI and wait for the result before returning. 
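+ * Packing the (cpu, function) pair into a single 64-bit value lets it
+ * travel in the proc entry's opaque "data" pointer with no allocation.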
+ */
+typedef union {
+	u64 value;
+	struct {
+		unsigned req_cpu: 32;	/* for which CPU this info is */
+		unsigned func_id: 32;	/* which function is requested */
+	} pal_func_cpu;
+} pal_func_cpu_u_t;
+
+#define req_cpu	pal_func_cpu.req_cpu
+#define func_id	pal_func_cpu.func_id
+
+#ifdef CONFIG_SMP
+
+/*
+ * used to hold information about the final function to call
+ */
+typedef struct {
+	palinfo_func_t	func;	/* pointer to function to call */
+	char		*page;	/* buffer to store results */
+	int		ret;	/* return value from call */
+} palinfo_smp_data_t;
+
+
+/*
+ * this function does the actual final call and is called
+ * from the smp code, i.e., this is the palinfo callback routine
+ */
+static void
+palinfo_smp_call(void *info)
+{
+	palinfo_smp_data_t *data = (palinfo_smp_data_t *)info;
+	if (data == NULL) {
+		printk(KERN_ERR "palinfo: data pointer is NULL\n");
+		return;	/* no output */
+	}
+	/* does the actual call */
+	data->ret = (*data->func)(data->page);
+}
+
+/*
+ * function called to trigger the IPI, we need to access a remote CPU
+ * Return:
+ *	0        : error or nothing to output
+ *	otherwise how many bytes in the "page" buffer were written
+ */
+static
+int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
+{
+	palinfo_smp_data_t ptr;
+	int ret;
+
+	ptr.func = palinfo_entries[f->func_id].proc_read;
+	ptr.page = page;
+	ptr.ret  = 0; /* just in case */
+
+
+	/* will send IPI to other CPU and wait for completion of remote call */
+	if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 0, 1))) {
+		printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: "
+		       "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret);
+		return 0;
+	}
+	return ptr.ret;
+}
+#else /* ! CONFIG_SMP */
+static
+int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page)
+{
+	printk(KERN_ERR "palinfo: should not be called on a non-SMP kernel\n");
+	return 0;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Entry point routine: all calls go through this function
+ */
+static int
+palinfo_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
+{
+	int len=0;
+	pal_func_cpu_u_t *f = (pal_func_cpu_u_t *)&data;
+
+	/*
+	 * in SMP mode, we may need to call another CPU to get correct
+	 * information. PAL, by definition, is processor specific
+	 */
+	if (f->req_cpu == get_cpu())
+		len = (*palinfo_entries[f->func_id].proc_read)(page);
+	else
+		len = palinfo_handle_smp(f, page);
+
+	put_cpu();
+
+	if (len <= off+count) *eof = 1;
+
+	*start = page + off;
+	len   -= off;
+
+	if (len>count) len = count;
+	if (len<0) len = 0;
+
+	return len;
+}
+
+static void
+create_palinfo_proc_entries(unsigned int cpu)
+{
+#	define CPUSTR	"cpu%d"
+
+	pal_func_cpu_u_t f;
+	struct proc_dir_entry **pdir;
+	struct proc_dir_entry *cpu_dir;
+	int j;
+	char cpustr[sizeof(CPUSTR)];
+
+
+	/*
+	 * we keep track of created entries in a depth-first order for
+	 * cleanup purposes. Each entry is stored into palinfo_proc_entries
+	 */
+	sprintf(cpustr,CPUSTR, cpu);
+
+	cpu_dir = proc_mkdir(cpustr, palinfo_dir);
+
+	f.req_cpu = cpu;
+
+	/*
+	 * Compute the location to store per cpu entries
+	 * We don't store the top level entry in this list, but
+	 * remove it finally after removing all cpu entries.
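+	 * ("top level" refers to the /proc/pal directory itself, which
+	 * palinfo_exit() removes only after all cpu entries are gone.)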
+ */ + pdir = &palinfo_proc_entries[cpu*(NR_PALINFO_ENTRIES+1)]; + *pdir++ = cpu_dir; + for (j=0; j < NR_PALINFO_ENTRIES; j++) { + f.func_id = j; + *pdir = create_proc_read_entry( + palinfo_entries[j].name, 0, cpu_dir, + palinfo_read_entry, (void *)f.value); + if (*pdir) + (*pdir)->owner = THIS_MODULE; + pdir++; + } +} + +static void +remove_palinfo_proc_entries(unsigned int hcpu) +{ + int j; + struct proc_dir_entry *cpu_dir, **pdir; + + pdir = &palinfo_proc_entries[hcpu*(NR_PALINFO_ENTRIES+1)]; + cpu_dir = *pdir; + *pdir++=NULL; + for (j=0; j < (NR_PALINFO_ENTRIES); j++) { + if ((*pdir)) { + remove_proc_entry ((*pdir)->name, cpu_dir); + *pdir ++= NULL; + } + } + + if (cpu_dir) { + remove_proc_entry(cpu_dir->name, palinfo_dir); + } +} + +static int __devinit palinfo_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + create_palinfo_proc_entries(hotcpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + remove_palinfo_proc_entries(hotcpu); + break; +#endif + } + return NOTIFY_OK; +} + +static struct notifier_block palinfo_cpu_notifier = +{ + .notifier_call = palinfo_cpu_callback, + .priority = 0, +}; + +static int __init +palinfo_init(void) +{ + int i = 0; + + printk(KERN_INFO "PAL Information Facility v%s\n", PALINFO_VERSION); + palinfo_dir = proc_mkdir("pal", NULL); + + /* Create palinfo dirs in /proc for all online cpus */ + for_each_online_cpu(i) { + create_palinfo_proc_entries(i); + } + + /* Register for future delivery via notify registration */ + register_cpu_notifier(&palinfo_cpu_notifier); + + return 0; +} + +static void __exit +palinfo_exit(void) +{ + int i = 0; + + /* remove all nodes: depth first pass. Could optimize this */ + for_each_online_cpu(i) { + remove_palinfo_proc_entries(i); + } + + /* + * Remove the top level entry finally + */ + remove_proc_entry(palinfo_dir->name, NULL); + + /* + * Unregister from cpu notifier callbacks + */ + unregister_cpu_notifier(&palinfo_cpu_notifier); +} + +module_init(palinfo_init); +module_exit(palinfo_exit); diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c new file mode 100644 index 000000000000..367804a605fa --- /dev/null +++ b/arch/ia64/kernel/patch.c @@ -0,0 +1,189 @@ +/* + * Instruction-patching support. 
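+ * Rewrites flagged instruction bundles in place at boot time, driven
+ * by patch lists recorded when the kernel is linked.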
+ * + * Copyright (C) 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ +#include <linux/init.h> +#include <linux/string.h> + +#include <asm/patch.h> +#include <asm/processor.h> +#include <asm/sections.h> +#include <asm/system.h> +#include <asm/unistd.h> + +/* + * This was adapted from code written by Tony Luck: + * + * The 64-bit value in a "movl reg=value" is scattered between the two words of the bundle + * like this: + * + * 6 6 5 4 3 2 1 + * 3210987654321098765432109876543210987654321098765432109876543210 + * ABBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCDEEEEEFFFFFFFFFGGGGGGG + * + * CCCCCCCCCCCCCCCCCCxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + * xxxxAFFFFFFFFFEEEEEDxGGGGGGGxxxxxxxxxxxxxBBBBBBBBBBBBBBBBBBBBBBB + */ +static u64 +get_imm64 (u64 insn_addr) +{ + u64 *p = (u64 *) (insn_addr & -16); /* mask out slot number */ + + return ( (p[1] & 0x0800000000000000UL) << 4) | /*A*/ + ((p[1] & 0x00000000007fffffUL) << 40) | /*B*/ + ((p[0] & 0xffffc00000000000UL) >> 24) | /*C*/ + ((p[1] & 0x0000100000000000UL) >> 23) | /*D*/ + ((p[1] & 0x0003e00000000000UL) >> 29) | /*E*/ + ((p[1] & 0x07fc000000000000UL) >> 43) | /*F*/ + ((p[1] & 0x000007f000000000UL) >> 36); /*G*/ +} + +/* Patch instruction with "val" where "mask" has 1 bits. */ +void +ia64_patch (u64 insn_addr, u64 mask, u64 val) +{ + u64 m0, m1, v0, v1, b0, b1, *b = (u64 *) (insn_addr & -16); +# define insn_mask ((1UL << 41) - 1) + unsigned long shift; + + b0 = b[0]; b1 = b[1]; + shift = 5 + 41 * (insn_addr % 16); /* 5 bits of template, then 3 x 41-bit instructions */ + if (shift >= 64) { + m1 = mask << (shift - 64); + v1 = val << (shift - 64); + } else { + m0 = mask << shift; m1 = mask >> (64 - shift); + v0 = val << shift; v1 = val >> (64 - shift); + b[0] = (b0 & ~m0) | (v0 & m0); + } + b[1] = (b1 & ~m1) | (v1 & m1); +} + +void +ia64_patch_imm64 (u64 insn_addr, u64 val) +{ + ia64_patch(insn_addr, + 0x01fffefe000UL, ( ((val & 0x8000000000000000UL) >> 27) /* bit 63 -> 36 */ + | ((val & 0x0000000000200000UL) << 0) /* bit 21 -> 21 */ + | ((val & 0x00000000001f0000UL) << 6) /* bit 16 -> 22 */ + | ((val & 0x000000000000ff80UL) << 20) /* bit 7 -> 27 */ + | ((val & 0x000000000000007fUL) << 13) /* bit 0 -> 13 */)); + ia64_patch(insn_addr - 1, 0x1ffffffffffUL, val >> 22); +} + +void +ia64_patch_imm60 (u64 insn_addr, u64 val) +{ + ia64_patch(insn_addr, + 0x011ffffe000UL, ( ((val & 0x0800000000000000UL) >> 23) /* bit 59 -> 36 */ + | ((val & 0x00000000000fffffUL) << 13) /* bit 0 -> 13 */)); + ia64_patch(insn_addr - 1, 0x1fffffffffcUL, val >> 18); +} + +/* + * We need sometimes to load the physical address of a kernel + * object. Often we can convert the virtual address to physical + * at execution time, but sometimes (either for performance reasons + * or during error recovery) we cannot to this. Patch the marked + * bundles to load the physical address. 
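+ * Each patch-list entry is a 32-bit offset, relative to its own
+ * address, that points at the movl bundle to rewrite.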
+ */ +void __init +ia64_patch_vtop (unsigned long start, unsigned long end) +{ + s32 *offp = (s32 *) start; + u64 ip; + + while (offp < (s32 *) end) { + ip = (u64) offp + *offp; + + /* replace virtual address with corresponding physical address: */ + ia64_patch_imm64(ip, ia64_tpa(get_imm64(ip))); + ia64_fc((void *) ip); + ++offp; + } + ia64_sync_i(); + ia64_srlz_i(); +} + +void +ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) +{ + static int first_time = 1; + int need_workaround; + s32 *offp = (s32 *) start; + u64 *wp; + + need_workaround = (local_cpu_data->family == 0x1f && local_cpu_data->model == 0); + + if (first_time) { + first_time = 0; + if (need_workaround) + printk(KERN_INFO "Leaving McKinley Errata 9 workaround enabled\n"); + else + printk(KERN_INFO "McKinley Errata 9 workaround not needed; " + "disabling it\n"); + } + if (need_workaround) + return; + + while (offp < (s32 *) end) { + wp = (u64 *) ia64_imva((char *) offp + *offp); + wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ + wp[1] = 0x0004000000000200UL; + wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ + wp[3] = 0x0084006880000200UL; + ia64_fc(wp); ia64_fc(wp + 2); + ++offp; + } + ia64_sync_i(); + ia64_srlz_i(); +} + +static void +patch_fsyscall_table (unsigned long start, unsigned long end) +{ + extern unsigned long fsyscall_table[NR_syscalls]; + s32 *offp = (s32 *) start; + u64 ip; + + while (offp < (s32 *) end) { + ip = (u64) ia64_imva((char *) offp + *offp); + ia64_patch_imm64(ip, (u64) fsyscall_table); + ia64_fc((void *) ip); + ++offp; + } + ia64_sync_i(); + ia64_srlz_i(); +} + +static void +patch_brl_fsys_bubble_down (unsigned long start, unsigned long end) +{ + extern char fsys_bubble_down[]; + s32 *offp = (s32 *) start; + u64 ip; + + while (offp < (s32 *) end) { + ip = (u64) offp + *offp; + ia64_patch_imm60((u64) ia64_imva((void *) ip), + (u64) (fsys_bubble_down - (ip & -16)) / 16); + ia64_fc((void *) ip); + ++offp; + } + ia64_sync_i(); + ia64_srlz_i(); +} + +void +ia64_patch_gate (void) +{ +# define START(name) ((unsigned long) __start_gate_##name##_patchlist) +# define END(name) ((unsigned long)__end_gate_##name##_patchlist) + + patch_fsyscall_table(START(fsyscall), END(fsyscall)); + patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down)); + ia64_patch_vtop(START(vtop), END(vtop)); + ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9)); +} diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c new file mode 100644 index 000000000000..71147be3279c --- /dev/null +++ b/arch/ia64/kernel/perfmon.c @@ -0,0 +1,6676 @@ +/* + * This file implements the perfmon-2 subsystem which is used + * to program the IA-64 Performance Monitoring Unit (PMU). + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. 
+ * + * Copyright (C) 1999-2003, 2005 Hewlett Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * More information about perfmon available at: + * http://www.hpl.hp.com/research/linux/perfmon + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/smp_lock.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/list.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/vfs.h> +#include <linux/pagemap.h> +#include <linux/mount.h> +#include <linux/version.h> +#include <linux/bitops.h> + +#include <asm/errno.h> +#include <asm/intrinsics.h> +#include <asm/page.h> +#include <asm/perfmon.h> +#include <asm/processor.h> +#include <asm/signal.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/delay.h> + +#ifdef CONFIG_PERFMON +/* + * perfmon context state + */ +#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */ +#define PFM_CTX_LOADED 2 /* context is loaded onto a task */ +#define PFM_CTX_MASKED 3 /* context is loaded but monitoring is masked due to overflow */ +#define PFM_CTX_ZOMBIE 4 /* owner of the context is closing it */ + +#define PFM_INVALID_ACTIVATION (~0UL) + +/* + * depth of message queue + */ +#define PFM_MAX_MSGS 32 +#define PFM_CTXQ_EMPTY(g) ((g)->ctx_msgq_head == (g)->ctx_msgq_tail) + +/* + * type of a PMU register (bitmask). + * bitmask structure: + * bit0 : register implemented + * bit1 : end marker + * bit2-3 : reserved + * bit4 : pmc has pmc.pm + * bit5 : pmc controls a counter (has pmc.oi), pmd is used as counter + * bit6-7 : register type + * bit8-31: reserved + */ +#define PFM_REG_NOTIMPL 0x0 /* not implemented at all */ +#define PFM_REG_IMPL 0x1 /* register implemented */ +#define PFM_REG_END 0x2 /* end marker */ +#define PFM_REG_MONITOR (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */ +#define PFM_REG_COUNTING (0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */ +#define PFM_REG_CONTROL (0x4<<4|PFM_REG_IMPL) /* PMU control register */ +#define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */ +#define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */ + +#define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END) +#define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END) + +#define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY) + +/* i assumed unsigned */ +#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL)) +#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL)) + +/* XXX: these assume that register i is implemented */ +#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) +#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) +#define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR) +#define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL) + +#define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value +#define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask +#define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0] +#define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0] + +#define PFM_NUM_IBRS IA64_NUM_DBG_REGS +#define 
PFM_NUM_DBRS IA64_NUM_DBG_REGS + +#define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0) +#define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling) +#define PFM_CTX_TASK(h) (h)->ctx_task + +#define PMU_PMC_OI 5 /* position of pmc.oi bit */ + +/* XXX: does not support more than 64 PMDs */ +#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask) +#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL) + +#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask) + +#define CTX_USED_IBR(ctx,n) (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64) +#define CTX_USED_DBR(ctx,n) (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64) +#define CTX_USES_DBREGS(ctx) (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1) +#define PFM_CODE_RR 0 /* requesting code range restriction */ +#define PFM_DATA_RR 1 /* requestion data range restriction */ + +#define PFM_CPUINFO_CLEAR(v) pfm_get_cpu_var(pfm_syst_info) &= ~(v) +#define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v) +#define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info) + +#define RDEP(x) (1UL<<(x)) + +/* + * context protection macros + * in SMP: + * - we need to protect against CPU concurrency (spin_lock) + * - we need to protect against PMU overflow interrupts (local_irq_disable) + * in UP: + * - we need to protect against PMU overflow interrupts (local_irq_disable) + * + * spin_lock_irqsave()/spin_lock_irqrestore(): + * in SMP: local_irq_disable + spin_lock + * in UP : local_irq_disable + * + * spin_lock()/spin_lock(): + * in UP : removed automatically + * in SMP: protect against context accesses from other CPU. interrupts + * are not masked. This is useful for the PMU interrupt handler + * because we know we will not get PMU concurrency in that code. + */ +#define PROTECT_CTX(c, f) \ + do { \ + DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \ + spin_lock_irqsave(&(c)->ctx_lock, f); \ + DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \ + } while(0) + +#define UNPROTECT_CTX(c, f) \ + do { \ + DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \ + spin_unlock_irqrestore(&(c)->ctx_lock, f); \ + } while(0) + +#define PROTECT_CTX_NOPRINT(c, f) \ + do { \ + spin_lock_irqsave(&(c)->ctx_lock, f); \ + } while(0) + + +#define UNPROTECT_CTX_NOPRINT(c, f) \ + do { \ + spin_unlock_irqrestore(&(c)->ctx_lock, f); \ + } while(0) + + +#define PROTECT_CTX_NOIRQ(c) \ + do { \ + spin_lock(&(c)->ctx_lock); \ + } while(0) + +#define UNPROTECT_CTX_NOIRQ(c) \ + do { \ + spin_unlock(&(c)->ctx_lock); \ + } while(0) + + +#ifdef CONFIG_SMP + +#define GET_ACTIVATION() pfm_get_cpu_var(pmu_activation_number) +#define INC_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)++ +#define SET_ACTIVATION(c) (c)->ctx_last_activation = GET_ACTIVATION() + +#else /* !CONFIG_SMP */ +#define SET_ACTIVATION(t) do {} while(0) +#define GET_ACTIVATION(t) do {} while(0) +#define INC_ACTIVATION(t) do {} while(0) +#endif /* CONFIG_SMP */ + +#define SET_PMU_OWNER(t, c) do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0) +#define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner) +#define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx) + +#define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g) +#define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g) + +#define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0) + +/* + * cmp0 must be the value of pmc0 + */ +#define PMC0_HAS_OVFL(cmp0) (cmp0 & ~0x1UL) + +#define PFMFS_MAGIC 0xa0b4d889 + 
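+/*
+ * Illustrative sketch (not a definition from this file): users of the
+ * locking macros above bracket every context access like
+ *
+ *	unsigned long flags;
+ *
+ *	PROTECT_CTX(ctx, flags);
+ *	... read or modify ctx state ...
+ *	UNPROTECT_CTX(ctx, flags);
+ *
+ * which boils down to spin_lock_irqsave()/spin_unlock_irqrestore() on
+ * ctx->ctx_lock, per the protection rules spelled out above.
+ */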
+/* + * debugging + */ +#define PFM_DEBUGGING 1 +#ifdef PFM_DEBUGGING +#define DPRINT(a) \ + do { \ + if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ + } while (0) + +#define DPRINT_ovfl(a) \ + do { \ + if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ + } while (0) +#endif + +/* + * 64-bit software counter structure + * + * the next_reset_type is applied to the next call to pfm_reset_regs() + */ +typedef struct { + unsigned long val; /* virtual 64bit counter value */ + unsigned long lval; /* last reset value */ + unsigned long long_reset; /* reset value on sampling overflow */ + unsigned long short_reset; /* reset value on overflow */ + unsigned long reset_pmds[4]; /* which other pmds to reset when this counter overflows */ + unsigned long smpl_pmds[4]; /* which pmds are accessed when counter overflow */ + unsigned long seed; /* seed for random-number generator */ + unsigned long mask; /* mask for random-number generator */ + unsigned int flags; /* notify/do not notify */ + unsigned long eventid; /* overflow event identifier */ +} pfm_counter_t; + +/* + * context flags + */ +typedef struct { + unsigned int block:1; /* when 1, task will blocked on user notifications */ + unsigned int system:1; /* do system wide monitoring */ + unsigned int using_dbreg:1; /* using range restrictions (debug registers) */ + unsigned int is_sampling:1; /* true if using a custom format */ + unsigned int excl_idle:1; /* exclude idle task in system wide session */ + unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */ + unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */ + unsigned int no_msg:1; /* no message sent on overflow */ + unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ + unsigned int reserved:22; +} pfm_context_flags_t; + +#define PFM_TRAP_REASON_NONE 0x0 /* default value */ +#define PFM_TRAP_REASON_BLOCK 0x1 /* we need to block on overflow */ +#define PFM_TRAP_REASON_RESET 0x2 /* we need to reset PMDs */ + + +/* + * perfmon context: encapsulates all the state of a monitoring session + */ + +typedef struct pfm_context { + spinlock_t ctx_lock; /* context protection */ + + pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) 
*/ + unsigned int ctx_state; /* state: active/inactive (no bitfield) */ + + struct task_struct *ctx_task; /* task to which context is attached */ + + unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */ + + struct semaphore ctx_restart_sem; /* use for blocking notification mode */ + + unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */ + unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */ + unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */ + + unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */ + unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */ + unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */ + + unsigned long ctx_pmcs[IA64_NUM_PMC_REGS]; /* saved copies of PMC values */ + + unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */ + unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */ + unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */ + unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */ + + pfm_counter_t ctx_pmds[IA64_NUM_PMD_REGS]; /* software state for PMDS */ + + u64 ctx_saved_psr_up; /* only contains psr.up value */ + + unsigned long ctx_last_activation; /* context last activation number for last_cpu */ + unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */ + unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */ + + int ctx_fd; /* file descriptor used my this context */ + pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */ + + pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */ + void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */ + unsigned long ctx_smpl_size; /* size of sampling buffer */ + void *ctx_smpl_vaddr; /* user level virtual address of smpl buffer */ + + wait_queue_head_t ctx_msgq_wait; + pfm_msg_t ctx_msgq[PFM_MAX_MSGS]; + int ctx_msgq_head; + int ctx_msgq_tail; + struct fasync_struct *ctx_async_queue; + + wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */ +} pfm_context_t; + +/* + * magic number used to verify that structure is really + * a perfmon context + */ +#define PFM_IS_FILE(f) ((f)->f_op == &pfm_file_ops) + +#define PFM_GET_CTX(t) ((pfm_context_t *)(t)->thread.pfm_context) + +#ifdef CONFIG_SMP +#define SET_LAST_CPU(ctx, v) (ctx)->ctx_last_cpu = (v) +#define GET_LAST_CPU(ctx) (ctx)->ctx_last_cpu +#else +#define SET_LAST_CPU(ctx, v) do {} while(0) +#define GET_LAST_CPU(ctx) do {} while(0) +#endif + + +#define ctx_fl_block ctx_flags.block +#define ctx_fl_system ctx_flags.system +#define ctx_fl_using_dbreg ctx_flags.using_dbreg +#define ctx_fl_is_sampling ctx_flags.is_sampling +#define ctx_fl_excl_idle ctx_flags.excl_idle +#define ctx_fl_going_zombie ctx_flags.going_zombie +#define ctx_fl_trap_reason ctx_flags.trap_reason +#define ctx_fl_no_msg ctx_flags.no_msg +#define ctx_fl_can_restart ctx_flags.can_restart + +#define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0); +#define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking + +/* + * global information about all sessions + * mostly used to synchronize between system wide and per-process + */ +typedef struct { + spinlock_t pfs_lock; /* lock the structure */ + + unsigned int pfs_task_sessions; /* number of per task sessions */ + unsigned int pfs_sys_sessions; /* number of per system wide sessions */ + 
unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */ + unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */ + struct task_struct *pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */ +} pfm_session_t; + +/* + * information about a PMC or PMD. + * dep_pmd[]: a bitmask of dependent PMD registers + * dep_pmc[]: a bitmask of dependent PMC registers + */ +typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); +typedef struct { + unsigned int type; + int pm_pos; + unsigned long default_value; /* power-on default value */ + unsigned long reserved_mask; /* bitmask of reserved bits */ + pfm_reg_check_t read_check; + pfm_reg_check_t write_check; + unsigned long dep_pmd[4]; + unsigned long dep_pmc[4]; +} pfm_reg_desc_t; + +/* assume cnum is a valid monitor */ +#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1) + +/* + * This structure is initialized at boot time and contains + * a description of the PMU main characteristics. + * + * If the probe function is defined, detection is based + * on its return value: + * - 0 means recognized PMU + * - anything else means not supported + * When the probe function is not defined, then the pmu_family field + * is used and it must match the host CPU family such that: + * - cpu->family & config->pmu_family != 0 + */ +typedef struct { + unsigned long ovfl_val; /* overflow value for counters */ + + pfm_reg_desc_t *pmc_desc; /* detailed PMC register dependencies descriptions */ + pfm_reg_desc_t *pmd_desc; /* detailed PMD register dependencies descriptions */ + + unsigned int num_pmcs; /* number of PMCS: computed at init time */ + unsigned int num_pmds; /* number of PMDS: computed at init time */ + unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */ + unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */ + + char *pmu_name; /* PMU family name */ + unsigned int pmu_family; /* cpuid family pattern used to identify pmu */ + unsigned int flags; /* pmu specific flags */ + unsigned int num_ibrs; /* number of IBRS: computed at init time */ + unsigned int num_dbrs; /* number of DBRS: computed at init time */ + unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */ + int (*probe)(void); /* customized probe routine */ + unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */ +} pmu_config_t; +/* + * PMU specific flags + */ +#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */ + +/* + * debug register related type definitions + */ +typedef struct { + unsigned long ibr_mask:56; + unsigned long ibr_plm:4; + unsigned long ibr_ig:3; + unsigned long ibr_x:1; +} ibr_mask_reg_t; + +typedef struct { + unsigned long dbr_mask:56; + unsigned long dbr_plm:4; + unsigned long dbr_ig:2; + unsigned long dbr_w:1; + unsigned long dbr_r:1; +} dbr_mask_reg_t; + +typedef union { + unsigned long val; + ibr_mask_reg_t ibr; + dbr_mask_reg_t dbr; +} dbreg_t; + + +/* + * perfmon command descriptions + */ +typedef struct { + int (*cmd_func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); + char *cmd_name; + int cmd_flags; + unsigned int cmd_narg; + size_t cmd_argsize; + int (*cmd_getsize)(void *arg, size_t *sz); +} pfm_cmd_desc_t; + +#define PFM_CMD_FD 0x01 /* command requires a file descriptor */ +#define PFM_CMD_ARG_READ 0x02 /* command must read argument(s) */ +#define PFM_CMD_ARG_RW 0x04 /* 
command must read/write argument(s) */ +#define PFM_CMD_STOP 0x08 /* command does not work on zombie context */ + + +#define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name +#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ) +#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW) +#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD) +#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP) + +#define PFM_CMD_ARG_MANY -1 /* cannot be zero */ + +typedef struct { + int debug; /* turn on/off debugging via syslog */ + int debug_ovfl; /* turn on/off debug printk in overflow handler */ + int fastctxsw; /* turn on/off fast (unsecure) ctxsw */ + int expert_mode; /* turn on/off value checking */ + int debug_pfm_read; +} pfm_sysctl_t; + +typedef struct { + unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */ + unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */ + unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */ + unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */ + unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */ + unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */ + unsigned long pfm_smpl_handler_calls; + unsigned long pfm_smpl_handler_cycles; + char pad[SMP_CACHE_BYTES] ____cacheline_aligned; +} pfm_stats_t; + +/* + * perfmon internal variables + */ +static pfm_stats_t pfm_stats[NR_CPUS]; +static pfm_session_t pfm_sessions; /* global sessions information */ + +static struct proc_dir_entry *perfmon_dir; +static pfm_uuid_t pfm_null_uuid = {0,}; + +static spinlock_t pfm_buffer_fmt_lock; +static LIST_HEAD(pfm_buffer_fmt_list); + +static pmu_config_t *pmu_conf; + +/* sysctl() controls */ +static pfm_sysctl_t pfm_sysctl; +int pfm_debug_var; + +static ctl_table pfm_ctl_table[]={ + {1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, + {2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,}, + {3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,}, + {4, "expert_mode", &pfm_sysctl.expert_mode, sizeof(int), 0600, NULL, &proc_dointvec, NULL,}, + { 0, }, +}; +static ctl_table pfm_sysctl_dir[] = { + {1, "perfmon", NULL, 0, 0755, pfm_ctl_table, }, + {0,}, +}; +static ctl_table pfm_sysctl_root[] = { + {1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, }, + {0,}, +}; +static struct ctl_table_header *pfm_sysctl_header; + +static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); +static int pfm_flush(struct file *filp); + +#define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) +#define pfm_get_cpu_data(a,b) per_cpu(a, b) + +static inline void +pfm_put_task(struct task_struct *task) +{ + if (task != current) put_task_struct(task); +} + +static inline void +pfm_set_task_notify(struct task_struct *task) +{ + struct thread_info *info; + + info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE); + set_bit(TIF_NOTIFY_RESUME, &info->flags); +} + +static inline void +pfm_clear_task_notify(void) +{ + clear_thread_flag(TIF_NOTIFY_RESUME); +} + +static inline void +pfm_reserve_page(unsigned long a) +{ + SetPageReserved(vmalloc_to_page((void *)a)); +} +static inline void +pfm_unreserve_page(unsigned long a) +{ + ClearPageReserved(vmalloc_to_page((void*)a)); +} + +static inline unsigned long 
+pfm_protect_ctx_ctxsw(pfm_context_t *x)
+{
+	spin_lock(&(x)->ctx_lock);
+	return 0UL;
+}
+
+static inline void
+pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
+{
+	spin_unlock(&(x)->ctx_lock);
+}
+
+static inline unsigned int
+pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct)
+{
+	return do_munmap(mm, addr, len);
+}
+
+static inline unsigned long
+pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
+{
+	return get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
+
+static struct super_block *
+pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
+{
+	return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC);
+}
+
+static struct file_system_type pfm_fs_type = {
+	.name     = "pfmfs",
+	.get_sb   = pfmfs_get_sb,
+	.kill_sb  = kill_anon_super,
+};
+
+DEFINE_PER_CPU(unsigned long, pfm_syst_info);
+DEFINE_PER_CPU(struct task_struct *, pmu_owner);
+DEFINE_PER_CPU(pfm_context_t  *, pmu_ctx);
+DEFINE_PER_CPU(unsigned long, pmu_activation_number);
+
+
+/* forward declaration */
+static struct file_operations pfm_file_ops;
+
+/*
+ * forward declarations
+ */
+#ifndef CONFIG_SMP
+static void pfm_lazy_save_regs (struct task_struct *ta);
+#endif
+
+void dump_pmu_state(const char *);
+static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
+
+#include "perfmon_itanium.h"
+#include "perfmon_mckinley.h"
+#include "perfmon_generic.h"
+
+static pmu_config_t *pmu_confs[]={
+	&pmu_conf_mck,
+	&pmu_conf_ita,
+	&pmu_conf_gen, /* must be last */
+	NULL
+};
+
+
+static int pfm_end_notify_user(pfm_context_t *ctx);
+
+static inline void
+pfm_clear_psr_pp(void)
+{
+	ia64_rsm(IA64_PSR_PP);
+	ia64_srlz_i();
+}
+
+static inline void
+pfm_set_psr_pp(void)
+{
+	ia64_ssm(IA64_PSR_PP);
+	ia64_srlz_i();
+}
+
+static inline void
+pfm_clear_psr_up(void)
+{
+	ia64_rsm(IA64_PSR_UP);
+	ia64_srlz_i();
+}
+
+static inline void
+pfm_set_psr_up(void)
+{
+	ia64_ssm(IA64_PSR_UP);
+	ia64_srlz_i();
+}
+
+static inline unsigned long
+pfm_get_psr(void)
+{
+	unsigned long tmp;
+	tmp = ia64_getreg(_IA64_REG_PSR);
+	ia64_srlz_i();
+	return tmp;
+}
+
+static inline void
+pfm_set_psr_l(unsigned long val)
+{
+	ia64_setreg(_IA64_REG_PSR_L, val);
+	ia64_srlz_i();
+}
+
+static inline void
+pfm_freeze_pmu(void)
+{
+	ia64_set_pmc(0,1UL);
+	ia64_srlz_d();
+}
+
+static inline void
+pfm_unfreeze_pmu(void)
+{
+	ia64_set_pmc(0,0UL);
+	ia64_srlz_d();
+}
+
+static inline void
+pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs)
+{
+	int i;
+
+	for (i=0; i < nibrs; i++) {
+		ia64_set_ibr(i, ibrs[i]);
+		ia64_dv_serialize_instruction();
+	}
+	ia64_srlz_i();
+}
+
+static inline void
+pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs)
+{
+	int i;
+
+	for (i=0; i < ndbrs; i++) {
+		ia64_set_dbr(i, dbrs[i]);
+		ia64_dv_serialize_data();
+	}
+	ia64_srlz_d();
+}
+
+/*
+ * PMD[i] must be a counter. no check is made
+ */
+static inline unsigned long
+pfm_read_soft_counter(pfm_context_t *ctx, int i)
+{
+	return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val);
+}
+
+/*
+ * PMD[i] must be a counter.
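+ * The 64-bit value is split: the high bits stay in the software state
+ * (ctx_pmds[i].val) while the low-order bits, within
+ * pmu_conf->ovfl_val, are written to the hardware PMD;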
no check is made + */ +static inline void +pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val) +{ + unsigned long ovfl_val = pmu_conf->ovfl_val; + + ctx->ctx_pmds[i].val = val & ~ovfl_val; + /* + * writing to unimplemented part is ignore, so we do not need to + * mask off top part + */ + ia64_set_pmd(i, val & ovfl_val); +} + +static pfm_msg_t * +pfm_get_new_msg(pfm_context_t *ctx) +{ + int idx, next; + + next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS; + + DPRINT(("ctx_fd=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); + if (next == ctx->ctx_msgq_head) return NULL; + + idx = ctx->ctx_msgq_tail; + ctx->ctx_msgq_tail = next; + + DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx)); + + return ctx->ctx_msgq+idx; +} + +static pfm_msg_t * +pfm_get_next_msg(pfm_context_t *ctx) +{ + pfm_msg_t *msg; + + DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); + + if (PFM_CTXQ_EMPTY(ctx)) return NULL; + + /* + * get oldest message + */ + msg = ctx->ctx_msgq+ctx->ctx_msgq_head; + + /* + * and move forward + */ + ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS; + + DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type)); + + return msg; +} + +static void +pfm_reset_msgq(pfm_context_t *ctx) +{ + ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; + DPRINT(("ctx=%p msgq reset\n", ctx)); +} + +static void * +pfm_rvmalloc(unsigned long size) +{ + void *mem; + unsigned long addr; + + size = PAGE_ALIGN(size); + mem = vmalloc(size); + if (mem) { + //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); + memset(mem, 0, size); + addr = (unsigned long)mem; + while (size > 0) { + pfm_reserve_page(addr); + addr+=PAGE_SIZE; + size-=PAGE_SIZE; + } + } + return mem; +} + +static void +pfm_rvfree(void *mem, unsigned long size) +{ + unsigned long addr; + + if (mem) { + DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size)); + addr = (unsigned long) mem; + while ((long) size > 0) { + pfm_unreserve_page(addr); + addr+=PAGE_SIZE; + size-=PAGE_SIZE; + } + vfree(mem); + } + return; +} + +static pfm_context_t * +pfm_context_alloc(void) +{ + pfm_context_t *ctx; + + /* + * allocate context descriptor + * must be able to free with interrupts disabled + */ + ctx = kmalloc(sizeof(pfm_context_t), GFP_KERNEL); + if (ctx) { + memset(ctx, 0, sizeof(pfm_context_t)); + DPRINT(("alloc ctx @%p\n", ctx)); + } + return ctx; +} + +static void +pfm_context_free(pfm_context_t *ctx) +{ + if (ctx) { + DPRINT(("free ctx @%p\n", ctx)); + kfree(ctx); + } +} + +static void +pfm_mask_monitoring(struct task_struct *task) +{ + pfm_context_t *ctx = PFM_GET_CTX(task); + struct thread_struct *th = &task->thread; + unsigned long mask, val, ovfl_mask; + int i; + + DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid)); + + ovfl_mask = pmu_conf->ovfl_val; + /* + * monitoring can only be masked as a result of a valid + * counter overflow. In UP, it means that the PMU still + * has an owner. Note that the owner can be different + * from the current task. However the PMU state belongs + * to the owner. + * In SMP, a valid overflow only happens when task is + * current. Therefore if we come here, we know that + * the PMU state belongs to the current task, therefore + * we can access the live registers. + * + * So in both cases, the live register contains the owner's + * state. We can ONLY touch the PMU registers and NOT the PSR. 
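+ * (masking is therefore done below by clearing the privilege level
+ * mask, pmc.plm, of the used monitors, not by touching psr.up/psr.pp)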
+ * + * As a consequence to this call, the thread->pmds[] array + * contains stale information which must be ignored + * when context is reloaded AND monitoring is active (see + * pfm_restart). + */ + mask = ctx->ctx_used_pmds[0]; + for (i = 0; mask; i++, mask>>=1) { + /* skip non used pmds */ + if ((mask & 0x1) == 0) continue; + val = ia64_get_pmd(i); + + if (PMD_IS_COUNTING(i)) { + /* + * we rebuild the full 64 bit value of the counter + */ + ctx->ctx_pmds[i].val += (val & ovfl_mask); + } else { + ctx->ctx_pmds[i].val = val; + } + DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", + i, + ctx->ctx_pmds[i].val, + val & ovfl_mask)); + } + /* + * mask monitoring by setting the privilege level to 0 + * we cannot use psr.pp/psr.up for this, it is controlled by + * the user + * + * if task is current, modify actual registers, otherwise modify + * thread save state, i.e., what will be restored in pfm_load_regs() + */ + mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; + for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { + if ((mask & 0x1) == 0UL) continue; + ia64_set_pmc(i, th->pmcs[i] & ~0xfUL); + th->pmcs[i] &= ~0xfUL; + DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, th->pmcs[i])); + } + /* + * make all of this visible + */ + ia64_srlz_d(); +} + +/* + * must always be done with task == current + * + * context must be in MASKED state when calling + */ +static void +pfm_restore_monitoring(struct task_struct *task) +{ + pfm_context_t *ctx = PFM_GET_CTX(task); + struct thread_struct *th = &task->thread; + unsigned long mask, ovfl_mask; + unsigned long psr, val; + int i, is_system; + + is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf->ovfl_val; + + if (task != current) { + printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); + return; + } + if (ctx->ctx_state != PFM_CTX_MASKED) { + printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__, + task->pid, current->pid, ctx->ctx_state); + return; + } + psr = pfm_get_psr(); + /* + * monitoring is masked via the PMC. + * As we restore their value, we do not want each counter to + * restart right away. We stop monitoring using the PSR, + * restore the PMC (and PMD) and then re-establish the psr + * as it was. Note that there can be no pending overflow at + * this point, because monitoring was MASKED. 
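+ *
+ * The sequence below is therefore: stop monitoring via psr/dcr,
+ * restore the PMDs, then the PMCs, then the debug registers if used,
+ * and finally re-establish the saved PSR.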
+ * + * system-wide session are pinned and self-monitoring + */ + if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { + /* disable dcr pp */ + ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); + pfm_clear_psr_pp(); + } else { + pfm_clear_psr_up(); + } + /* + * first, we restore the PMD + */ + mask = ctx->ctx_used_pmds[0]; + for (i = 0; mask; i++, mask>>=1) { + /* skip non used pmds */ + if ((mask & 0x1) == 0) continue; + + if (PMD_IS_COUNTING(i)) { + /* + * we split the 64bit value according to + * counter width + */ + val = ctx->ctx_pmds[i].val & ovfl_mask; + ctx->ctx_pmds[i].val &= ~ovfl_mask; + } else { + val = ctx->ctx_pmds[i].val; + } + ia64_set_pmd(i, val); + + DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", + i, + ctx->ctx_pmds[i].val, + val)); + } + /* + * restore the PMCs + */ + mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; + for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { + if ((mask & 0x1) == 0UL) continue; + th->pmcs[i] = ctx->ctx_pmcs[i]; + ia64_set_pmc(i, th->pmcs[i]); + DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, th->pmcs[i])); + } + ia64_srlz_d(); + + /* + * must restore DBR/IBR because could be modified while masked + * XXX: need to optimize + */ + if (ctx->ctx_fl_using_dbreg) { + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); + } + + /* + * now restore PSR + */ + if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { + /* enable dcr pp */ + ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); + ia64_srlz_i(); + } + pfm_set_psr_l(psr); +} + +static inline void +pfm_save_pmds(unsigned long *pmds, unsigned long mask) +{ + int i; + + ia64_srlz_d(); + + for (i=0; mask; i++, mask>>=1) { + if (mask & 0x1) pmds[i] = ia64_get_pmd(i); + } +} + +/* + * reload from thread state (used for ctxw only) + */ +static inline void +pfm_restore_pmds(unsigned long *pmds, unsigned long mask) +{ + int i; + unsigned long val, ovfl_val = pmu_conf->ovfl_val; + + for (i=0; mask; i++, mask>>=1) { + if ((mask & 0x1) == 0) continue; + val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i]; + ia64_set_pmd(i, val); + } + ia64_srlz_d(); +} + +/* + * propagate PMD from context to thread-state + */ +static inline void +pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) +{ + struct thread_struct *thread = &task->thread; + unsigned long ovfl_val = pmu_conf->ovfl_val; + unsigned long mask = ctx->ctx_all_pmds[0]; + unsigned long val; + int i; + + DPRINT(("mask=0x%lx\n", mask)); + + for (i=0; mask; i++, mask>>=1) { + + val = ctx->ctx_pmds[i].val; + + /* + * We break up the 64 bit value into 2 pieces + * the lower bits go to the machine state in the + * thread (will be reloaded on ctxsw in). + * The upper part stays in the soft-counter. 
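+ * For instance, on a PMU with 47-bit counters (ovfl_val = 2^47-1),
+ * bits 0-46 of the 64-bit software value are reloaded into the
+ * hardware PMD while bits 47-63 remain in ctx_pmds[i].val.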
+ */ + if (PMD_IS_COUNTING(i)) { + ctx->ctx_pmds[i].val = val & ~ovfl_val; + val &= ovfl_val; + } + thread->pmds[i] = val; + + DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n", + i, + thread->pmds[i], + ctx->ctx_pmds[i].val)); + } +} + +/* + * propagate PMC from context to thread-state + */ +static inline void +pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx) +{ + struct thread_struct *thread = &task->thread; + unsigned long mask = ctx->ctx_all_pmcs[0]; + int i; + + DPRINT(("mask=0x%lx\n", mask)); + + for (i=0; mask; i++, mask>>=1) { + /* masking 0 with ovfl_val yields 0 */ + thread->pmcs[i] = ctx->ctx_pmcs[i]; + DPRINT(("pmc[%d]=0x%lx\n", i, thread->pmcs[i])); + } +} + + + +static inline void +pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask) +{ + int i; + + for (i=0; mask; i++, mask>>=1) { + if ((mask & 0x1) == 0) continue; + ia64_set_pmc(i, pmcs[i]); + } + ia64_srlz_d(); +} + +static inline int +pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b) +{ + return memcmp(a, b, sizeof(pfm_uuid_t)); +} + +static inline int +pfm_buf_fmt_exit(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, struct pt_regs *regs) +{ + int ret = 0; + if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs); + return ret; +} + +static inline int +pfm_buf_fmt_getsize(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size) +{ + int ret = 0; + if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size); + return ret; +} + + +static inline int +pfm_buf_fmt_validate(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, + int cpu, void *arg) +{ + int ret = 0; + if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg); + return ret; +} + +static inline int +pfm_buf_fmt_init(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, unsigned int flags, + int cpu, void *arg) +{ + int ret = 0; + if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg); + return ret; +} + +static inline int +pfm_buf_fmt_restart(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) +{ + int ret = 0; + if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs); + return ret; +} + +static inline int +pfm_buf_fmt_restart_active(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) +{ + int ret = 0; + if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs); + return ret; +} + +static pfm_buffer_fmt_t * +__pfm_find_buffer_fmt(pfm_uuid_t uuid) +{ + struct list_head * pos; + pfm_buffer_fmt_t * entry; + + list_for_each(pos, &pfm_buffer_fmt_list) { + entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); + if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0) + return entry; + } + return NULL; +} + +/* + * find a buffer format based on its uuid + */ +static pfm_buffer_fmt_t * +pfm_find_buffer_fmt(pfm_uuid_t uuid) +{ + pfm_buffer_fmt_t * fmt; + spin_lock(&pfm_buffer_fmt_lock); + fmt = __pfm_find_buffer_fmt(uuid); + spin_unlock(&pfm_buffer_fmt_lock); + return fmt; +} + +int +pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt) +{ + int ret = 0; + + /* some sanity checks */ + if (fmt == NULL || fmt->fmt_name == NULL) return -EINVAL; + + /* we need at least a handler */ + if (fmt->fmt_handler == NULL) return -EINVAL; + + /* + * XXX: need check validity of fmt_arg_size + */ + + spin_lock(&pfm_buffer_fmt_lock); + + if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) { + printk(KERN_ERR "perfmon: duplicate sampling 
format: %s\n", fmt->fmt_name);
+		ret = -EBUSY;
+		goto out;
+	}
+	list_add(&fmt->fmt_list, &pfm_buffer_fmt_list);
+	printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name);
+
+out:
+	spin_unlock(&pfm_buffer_fmt_lock);
+ 	return ret;
+}
+EXPORT_SYMBOL(pfm_register_buffer_fmt);
+
+int
+pfm_unregister_buffer_fmt(pfm_uuid_t uuid)
+{
+	pfm_buffer_fmt_t *fmt;
+	int ret = 0;
+
+	spin_lock(&pfm_buffer_fmt_lock);
+
+	fmt = __pfm_find_buffer_fmt(uuid);
+	if (!fmt) {
+		printk(KERN_ERR "perfmon: cannot unregister format, not found\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	list_del_init(&fmt->fmt_list);
+	printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name);
+
+out:
+	spin_unlock(&pfm_buffer_fmt_lock);
+	return ret;
+
+}
+EXPORT_SYMBOL(pfm_unregister_buffer_fmt);
+
+static int
+pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu)
+{
+	unsigned long flags;
+	/*
+	 * validity checks on cpu_mask have been done upstream
+	 */
+	LOCK_PFS(flags);
+
+	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
+		pfm_sessions.pfs_sys_sessions,
+		pfm_sessions.pfs_task_sessions,
+		pfm_sessions.pfs_sys_use_dbregs,
+		is_syswide,
+		cpu));
+
+	if (is_syswide) {
+		/*
+		 * cannot mix system wide and per-task sessions
+		 */
+		if (pfm_sessions.pfs_task_sessions > 0UL) {
+			DPRINT(("system wide not possible, %u conflicting task_sessions\n",
+			  	pfm_sessions.pfs_task_sessions));
+			goto abort;
+		}
+
+		if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict;
+
+		DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id()));
+
+		pfm_sessions.pfs_sys_session[cpu] = task;
+
+		pfm_sessions.pfs_sys_sessions++ ;
+
+	} else {
+		if (pfm_sessions.pfs_sys_sessions) goto abort;
+		pfm_sessions.pfs_task_sessions++;
+	}
+
+	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
+		pfm_sessions.pfs_sys_sessions,
+		pfm_sessions.pfs_task_sessions,
+		pfm_sessions.pfs_sys_use_dbregs,
+		is_syswide,
+		cpu));
+
+	UNLOCK_PFS(flags);
+
+	return 0;
+
+error_conflict:
+	DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n",
+  		pfm_sessions.pfs_sys_session[cpu]->pid,
+		smp_processor_id()));
+abort:
+	UNLOCK_PFS(flags);
+
+	return -EBUSY;
+
+}
+
+static int
+pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu)
+{
+	unsigned long flags;
+	/*
+	 * validity checks on cpu_mask have been done upstream
+	 */
+	LOCK_PFS(flags);
+
+	DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
+		pfm_sessions.pfs_sys_sessions,
+		pfm_sessions.pfs_task_sessions,
+		pfm_sessions.pfs_sys_use_dbregs,
+		is_syswide,
+		cpu));
+
+
+	if (is_syswide) {
+		pfm_sessions.pfs_sys_session[cpu] = NULL;
+		/*
+		 * would not work with perfmon+more than one bit in cpu_mask
+		 */
+		if (ctx && ctx->ctx_fl_using_dbreg) {
+			if (pfm_sessions.pfs_sys_use_dbregs == 0) {
+				printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx);
+			} else {
+				pfm_sessions.pfs_sys_use_dbregs--;
+			}
+		}
+		pfm_sessions.pfs_sys_sessions--;
+	} else {
+		pfm_sessions.pfs_task_sessions--;
+	}
+	DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n",
+		pfm_sessions.pfs_sys_sessions,
+		pfm_sessions.pfs_task_sessions,
+		pfm_sessions.pfs_sys_use_dbregs,
+		is_syswide,
+		cpu));
+
+	UNLOCK_PFS(flags);
+
+	return 0;
+}
+
+/*
+ * removes virtual mapping of the sampling buffer.
+ * IMPORTANT: cannot be called with interrupts disabled, e.g. inside
+ * a PROTECT_CTX() section.
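+ * (the unmapping goes through down_write(&mm->mmap_sem), which may
+ * sleep)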
+ */ +static int +pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size) +{ + int r; + + /* sanity checks */ + if (task->mm == NULL || size == 0UL || vaddr == NULL) { + printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm); + return -EINVAL; + } + + DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size)); + + /* + * does the actual unmapping + */ + down_write(&task->mm->mmap_sem); + + DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size)); + + r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0); + + up_write(&task->mm->mmap_sem); + if (r !=0) { + printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size); + } + + DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r)); + + return 0; +} + +/* + * free actual physical storage used by sampling buffer + */ +#if 0 +static int +pfm_free_smpl_buffer(pfm_context_t *ctx) +{ + pfm_buffer_fmt_t *fmt; + + if (ctx->ctx_smpl_hdr == NULL) goto invalid_free; + + /* + * we won't use the buffer format anymore + */ + fmt = ctx->ctx_buf_fmt; + + DPRINT(("sampling buffer @%p size %lu vaddr=%p\n", + ctx->ctx_smpl_hdr, + ctx->ctx_smpl_size, + ctx->ctx_smpl_vaddr)); + + pfm_buf_fmt_exit(fmt, current, NULL, NULL); + + /* + * free the buffer + */ + pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); + + ctx->ctx_smpl_hdr = NULL; + ctx->ctx_smpl_size = 0UL; + + return 0; + +invalid_free: + printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid); + return -EINVAL; +} +#endif + +static inline void +pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt) +{ + if (fmt == NULL) return; + + pfm_buf_fmt_exit(fmt, current, NULL, NULL); + +} + +/* + * pfmfs should _never_ be mounted by userland - too much of security hassle, + * no real gain from having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pfm: will go nicely and kill the special-casing in procfs. 
+ */ +static struct vfsmount *pfmfs_mnt; + +static int __init +init_pfm_fs(void) +{ + int err = register_filesystem(&pfm_fs_type); + if (!err) { + pfmfs_mnt = kern_mount(&pfm_fs_type); + err = PTR_ERR(pfmfs_mnt); + if (IS_ERR(pfmfs_mnt)) + unregister_filesystem(&pfm_fs_type); + else + err = 0; + } + return err; +} + +static void __exit +exit_pfm_fs(void) +{ + unregister_filesystem(&pfm_fs_type); + mntput(pfmfs_mnt); +} + +static ssize_t +pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) +{ + pfm_context_t *ctx; + pfm_msg_t *msg; + ssize_t ret; + unsigned long flags; + DECLARE_WAITQUEUE(wait, current); + if (PFM_IS_FILE(filp) == 0) { + printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); + return -EINVAL; + } + + ctx = (pfm_context_t *)filp->private_data; + if (ctx == NULL) { + printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid); + return -EINVAL; + } + + /* + * check even when there is no message + */ + if (size < sizeof(pfm_msg_t)) { + DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t))); + return -EINVAL; + } + + PROTECT_CTX(ctx, flags); + + /* + * put ourselves on the wait queue + */ + add_wait_queue(&ctx->ctx_msgq_wait, &wait); + + + for(;;) { + /* + * check wait queue + */ + + set_current_state(TASK_INTERRUPTIBLE); + + DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); + + ret = 0; + if(PFM_CTXQ_EMPTY(ctx) == 0) break; + + UNPROTECT_CTX(ctx, flags); + + /* + * check non-blocking read + */ + ret = -EAGAIN; + if(filp->f_flags & O_NONBLOCK) break; + + /* + * check pending signals + */ + if(signal_pending(current)) { + ret = -EINTR; + break; + } + /* + * no message, so wait + */ + schedule(); + + PROTECT_CTX(ctx, flags); + } + DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret)); + set_current_state(TASK_RUNNING); + remove_wait_queue(&ctx->ctx_msgq_wait, &wait); + + if (ret < 0) goto abort; + + ret = -EINVAL; + msg = pfm_get_next_msg(ctx); + if (msg == NULL) { + printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid); + goto abort_locked; + } + + DPRINT(("[%d] fd=%d type=%d\n", current->pid, msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type)); + + ret = -EFAULT; + if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t); + +abort_locked: + UNPROTECT_CTX(ctx, flags); +abort: + return ret; +} + +static ssize_t +pfm_write(struct file *file, const char __user *ubuf, + size_t size, loff_t *ppos) +{ + DPRINT(("pfm_write called\n")); + return -EINVAL; +} + +static unsigned int +pfm_poll(struct file *filp, poll_table * wait) +{ + pfm_context_t *ctx; + unsigned long flags; + unsigned int mask = 0; + + if (PFM_IS_FILE(filp) == 0) { + printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); + return 0; + } + + ctx = (pfm_context_t *)filp->private_data; + if (ctx == NULL) { + printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid); + return 0; + } + + + DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd)); + + poll_wait(filp, &ctx->ctx_msgq_wait, wait); + + PROTECT_CTX(ctx, flags); + + if (PFM_CTXQ_EMPTY(ctx) == 0) + mask = POLLIN | POLLRDNORM; + + UNPROTECT_CTX(ctx, flags); + + DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask)); + + return mask; +} + +static int +pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +{ + DPRINT(("pfm_ioctl called\n")); + return -EINVAL; +} + +/* + * interrupt cannot be masked when coming here + */ +static inline int +pfm_do_fasync(int 
fd, struct file *filp, pfm_context_t *ctx, int on)
+{
+	int ret;
+
+	ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue);
+
+	DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
+		current->pid,
+		fd,
+		on,
+		ctx->ctx_async_queue, ret));
+
+	return ret;
+}
+
+static int
+pfm_fasync(int fd, struct file *filp, int on)
+{
+	pfm_context_t *ctx;
+	int ret;
+
+	if (PFM_IS_FILE(filp) == 0) {
+		printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid);
+		return -EBADF;
+	}
+
+	ctx = (pfm_context_t *)filp->private_data;
+	if (ctx == NULL) {
+		printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid);
+		return -EBADF;
+	}
+	/*
+	 * we cannot mask interrupts during this call because it may
+	 * go to sleep if memory is not readily available.
+	 *
+	 * We are protected from the context disappearing by the get_fd()/put_fd()
+	 * done in caller. Serialization of this function is ensured by caller.
+	 */
+	ret = pfm_do_fasync(fd, filp, ctx, on);
+
+
+	DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n",
+		fd,
+		on,
+		ctx->ctx_async_queue, ret));
+
+	return ret;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * this function is exclusively called from pfm_close().
+ * The context is not protected at that time, nor are interrupts
+ * on the remote CPU. That's necessary to avoid deadlocks.
+ */
+static void
+pfm_syswide_force_stop(void *info)
+{
+	pfm_context_t   *ctx = (pfm_context_t *)info;
+	struct pt_regs *regs = ia64_task_regs(current);
+	struct task_struct *owner;
+	unsigned long flags;
+	int ret;
+
+	if (ctx->ctx_cpu != smp_processor_id()) {
+		printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d but on CPU%d\n",
+			ctx->ctx_cpu,
+			smp_processor_id());
+		return;
+	}
+	owner = GET_PMU_OWNER();
+	if (owner != ctx->ctx_task) {
+		printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n",
+			smp_processor_id(),
+			owner->pid, ctx->ctx_task->pid);
+		return;
+	}
+	if (GET_PMU_CTX() != ctx) {
+		printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n",
+			smp_processor_id(),
+			GET_PMU_CTX(), ctx);
+		return;
+	}
+
+	DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid));
+	/*
+	 * the context is already protected in pfm_close(), we simply
+	 * need to mask interrupts to avoid a PMU interrupt race on
+	 * this CPU
+	 */
+	local_irq_save(flags);
+
+	ret = pfm_context_unload(ctx, NULL, 0, regs);
+	if (ret) {
+		DPRINT(("context_unload returned %d\n", ret));
+	}
+
+	/*
+	 * unmask interrupts, PMU interrupts are now spurious here
+	 */
+	local_irq_restore(flags);
+}
+
+static void
+pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx)
+{
+	int ret;
+
+	DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu));
+	ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1);
+	DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret));
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * called for each close(). Partially free resources.
+ * When caller is self-monitoring, the context is unloaded.
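+ * The virtual mapping of the sampling buffer, if any, is removed
+ * for the calling task outside of the locked section below.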
+ */ +static int +pfm_flush(struct file *filp) +{ + pfm_context_t *ctx; + struct task_struct *task; + struct pt_regs *regs; + unsigned long flags; + unsigned long smpl_buf_size = 0UL; + void *smpl_buf_vaddr = NULL; + int state, is_system; + + if (PFM_IS_FILE(filp) == 0) { + DPRINT(("bad magic for\n")); + return -EBADF; + } + + ctx = (pfm_context_t *)filp->private_data; + if (ctx == NULL) { + printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid); + return -EBADF; + } + + /* + * remove our file from the async queue, if we use this mode. + * This can be done without the context being protected. We come + * here when the context has become unreacheable by other tasks. + * + * We may still have active monitoring at this point and we may + * end up in pfm_overflow_handler(). However, fasync_helper() + * operates with interrupts disabled and it cleans up the + * queue. If the PMU handler is called prior to entering + * fasync_helper() then it will send a signal. If it is + * invoked after, it will find an empty queue and no + * signal will be sent. In both case, we are safe + */ + if (filp->f_flags & FASYNC) { + DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue)); + pfm_do_fasync (-1, filp, ctx, 0); + } + + PROTECT_CTX(ctx, flags); + + state = ctx->ctx_state; + is_system = ctx->ctx_fl_system; + + task = PFM_CTX_TASK(ctx); + regs = ia64_task_regs(task); + + DPRINT(("ctx_state=%d is_current=%d\n", + state, + task == current ? 1 : 0)); + + /* + * if state == UNLOADED, then task is NULL + */ + + /* + * we must stop and unload because we are losing access to the context. + */ + if (task == current) { +#ifdef CONFIG_SMP + /* + * the task IS the owner but it migrated to another CPU: that's bad + * but we must handle this cleanly. Unfortunately, the kernel does + * not provide a mechanism to block migration (while the context is loaded). + * + * We need to release the resource on the ORIGINAL cpu. + */ + if (is_system && ctx->ctx_cpu != smp_processor_id()) { + + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + /* + * keep context protected but unmask interrupt for IPI + */ + local_irq_restore(flags); + + pfm_syswide_cleanup_other_cpu(ctx); + + /* + * restore interrupt masking + */ + local_irq_save(flags); + + /* + * context is unloaded at this point + */ + } else +#endif /* CONFIG_SMP */ + { + + DPRINT(("forcing unload\n")); + /* + * stop and unload, returning with state UNLOADED + * and session unreserved. + */ + pfm_context_unload(ctx, NULL, 0, regs); + + DPRINT(("ctx_state=%d\n", ctx->ctx_state)); + } + } + + /* + * remove virtual mapping, if any, for the calling task. + * cannot reset ctx field until last user is calling close(). + * + * ctx_smpl_vaddr must never be cleared because it is needed + * by every task with access to the context + * + * When called from do_exit(), the mm context is gone already, therefore + * mm is NULL, i.e., the VMA is already gone and we do not have to + * do anything here + */ + if (ctx->ctx_smpl_vaddr && current->mm) { + smpl_buf_vaddr = ctx->ctx_smpl_vaddr; + smpl_buf_size = ctx->ctx_smpl_size; + } + + UNPROTECT_CTX(ctx, flags); + + /* + * if there was a mapping, then we systematically remove it + * at this point. Cannot be done inside critical section + * because some VM function reenables interrupts. + * + */ + if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size); + + return 0; +} +/* + * called either on explicit close() or from exit_files(). 
+ * Only the LAST user of the file gets to this point, i.e., it is + * called only ONCE. + * + * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero + * (fput()),i.e, last task to access the file. Nobody else can access the + * file at this point. + * + * When called from exit_files(), the VMA has been freed because exit_mm() + * is executed before exit_files(). + * + * When called from exit_files(), the current task is not yet ZOMBIE but we + * flush the PMU state to the context. + */ +static int +pfm_close(struct inode *inode, struct file *filp) +{ + pfm_context_t *ctx; + struct task_struct *task; + struct pt_regs *regs; + DECLARE_WAITQUEUE(wait, current); + unsigned long flags; + unsigned long smpl_buf_size = 0UL; + void *smpl_buf_addr = NULL; + int free_possible = 1; + int state, is_system; + + DPRINT(("pfm_close called private=%p\n", filp->private_data)); + + if (PFM_IS_FILE(filp) == 0) { + DPRINT(("bad magic\n")); + return -EBADF; + } + + ctx = (pfm_context_t *)filp->private_data; + if (ctx == NULL) { + printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid); + return -EBADF; + } + + PROTECT_CTX(ctx, flags); + + state = ctx->ctx_state; + is_system = ctx->ctx_fl_system; + + task = PFM_CTX_TASK(ctx); + regs = ia64_task_regs(task); + + DPRINT(("ctx_state=%d is_current=%d\n", + state, + task == current ? 1 : 0)); + + /* + * if task == current, then pfm_flush() unloaded the context + */ + if (state == PFM_CTX_UNLOADED) goto doit; + + /* + * context is loaded/masked and task != current, we need to + * either force an unload or go zombie + */ + + /* + * The task is currently blocked or will block after an overflow. + * we must force it to wakeup to get out of the + * MASKED state and transition to the unloaded state by itself. + * + * This situation is only possible for per-task mode + */ + if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) { + + /* + * set a "partial" zombie state to be checked + * upon return from down() in pfm_handle_work(). + * + * We cannot use the ZOMBIE state, because it is checked + * by pfm_load_regs() which is called upon wakeup from down(). + * In such case, it would free the context and then we would + * return to pfm_handle_work() which would access the + * stale context. Instead, we set a flag invisible to pfm_load_regs() + * but visible to pfm_handle_work(). + * + * For some window of time, we have a zombie context with + * ctx_state = MASKED and not ZOMBIE + */ + ctx->ctx_fl_going_zombie = 1; + + /* + * force task to wake up from MASKED state + */ + up(&ctx->ctx_restart_sem); + + DPRINT(("waking up ctx_state=%d\n", state)); + + /* + * put ourself to sleep waiting for the other + * task to report completion + * + * the context is protected by mutex, therefore there + * is no risk of being notified of completion before + * begin actually on the waitq. 
+	 */
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&ctx->ctx_zombieq, &wait);
+
+	UNPROTECT_CTX(ctx, flags);
+
+	/*
+	 * XXX: check for signals :
+	 * 	- ok for explicit close
+	 * 	- not ok when coming from exit_files()
+	 */
+	schedule();
+
+
+	PROTECT_CTX(ctx, flags);
+
+
+	remove_wait_queue(&ctx->ctx_zombieq, &wait);
+	set_current_state(TASK_RUNNING);
+
+	/*
+	 * context is unloaded at this point
+	 */
+	DPRINT(("after zombie wakeup ctx_state=%d for\n", state));
+  }
+  else if (task != current) {
+#ifdef CONFIG_SMP
+		/*
+	 	 * switch context to zombie state
+	 	 */
+		ctx->ctx_state = PFM_CTX_ZOMBIE;
+
+		DPRINT(("zombie ctx for [%d]\n", task->pid));
+		/*
+		 * cannot free the context on the spot. deferred until
+		 * the task notices the ZOMBIE state
+		 */
+		free_possible = 0;
+#else
+		pfm_context_unload(ctx, NULL, 0, regs);
+#endif
+	}
+
+doit:
+	/* reload state, may have changed during opening of critical section */
+	state = ctx->ctx_state;
+
+	/*
+	 * the context is still attached to a task (possibly current)
+	 * we cannot destroy it right now
+	 */
+
+	/*
+	 * we must free the sampling buffer right here because
+	 * we cannot rely on it being cleaned up later by the
+	 * monitored task. It is not possible to free vmalloc'ed
+	 * memory in pfm_load_regs(). Instead, we remove the buffer
+	 * now. should there be subsequent PMU overflows originally
+	 * meant for sampling, they will be converted to spurious
+	 * and that's fine because the monitoring tool is gone anyway.
+	 */
+	if (ctx->ctx_smpl_hdr) {
+		smpl_buf_addr = ctx->ctx_smpl_hdr;
+		smpl_buf_size = ctx->ctx_smpl_size;
+		/* no more sampling */
+		ctx->ctx_smpl_hdr = NULL;
+		ctx->ctx_fl_is_sampling = 0;
+	}
+
+	DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n",
+		state,
+		free_possible,
+		smpl_buf_addr,
+		smpl_buf_size));
+
+	if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt);
+
+	/*
+	 * UNLOADED means that the session has already been unreserved.
+	 */
+	if (state == PFM_CTX_ZOMBIE) {
+		pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu);
+	}
+
+	/*
+	 * disconnect file descriptor from context must be done
+	 * before we unlock.
+	 */
+	filp->private_data = NULL;
+
+	/*
+	 * if we free on the spot, the context is now completely unreachable
+	 * from the callers side. The monitored task side is also cut, so we
+	 * can freely cut.
+	 *
+	 * If we have a deferred free, only the caller side is disconnected.
+	 */
+	UNPROTECT_CTX(ctx, flags);
+
+	/*
+	 * All memory free operations (especially for vmalloc'ed memory)
+	 * MUST be done with interrupts ENABLED.
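+	 * (pfm_rvfree() goes through vfree(), hence the constraint)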
+ */ + if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); + + /* + * return the memory used by the context + */ + if (free_possible) pfm_context_free(ctx); + + return 0; +} + +static int +pfm_no_open(struct inode *irrelevant, struct file *dontcare) +{ + DPRINT(("pfm_no_open called\n")); + return -ENXIO; +} + + + +static struct file_operations pfm_file_ops = { + .llseek = no_llseek, + .read = pfm_read, + .write = pfm_write, + .poll = pfm_poll, + .ioctl = pfm_ioctl, + .open = pfm_no_open, /* special open code to disallow open via /proc */ + .fasync = pfm_fasync, + .release = pfm_close, + .flush = pfm_flush +}; + +static int +pfmfs_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations pfmfs_dentry_operations = { + .d_delete = pfmfs_delete_dentry, +}; + + +static int +pfm_alloc_fd(struct file **cfile) +{ + int fd, ret = 0; + struct file *file = NULL; + struct inode * inode; + char name[32]; + struct qstr this; + + fd = get_unused_fd(); + if (fd < 0) return -ENFILE; + + ret = -ENFILE; + + file = get_empty_filp(); + if (!file) goto out; + + /* + * allocate a new inode + */ + inode = new_inode(pfmfs_mnt->mnt_sb); + if (!inode) goto out; + + DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode)); + + inode->i_mode = S_IFCHR|S_IRUGO; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + + sprintf(name, "[%lu]", inode->i_ino); + this.name = name; + this.len = strlen(name); + this.hash = inode->i_ino; + + ret = -ENOMEM; + + /* + * allocate a new dcache entry + */ + file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!file->f_dentry) goto out; + + file->f_dentry->d_op = &pfmfs_dentry_operations; + + d_add(file->f_dentry, inode); + file->f_vfsmnt = mntget(pfmfs_mnt); + file->f_mapping = inode->i_mapping; + + file->f_op = &pfm_file_ops; + file->f_mode = FMODE_READ; + file->f_flags = O_RDONLY; + file->f_pos = 0; + + /* + * may have to delay until context is attached? 
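+	 * (once fd_install() below runs, the descriptor is visible to
+	 * the task even though the context is not attached yet)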
+ */ + fd_install(fd, file); + + /* + * the file structure we will use + */ + *cfile = file; + + return fd; +out: + if (file) put_filp(file); + put_unused_fd(fd); + return ret; +} + +static void +pfm_free_fd(int fd, struct file *file) +{ + struct files_struct *files = current->files; + + /* + * there ie no fd_uninstall(), so we do it here + */ + spin_lock(&files->file_lock); + files->fd[fd] = NULL; + spin_unlock(&files->file_lock); + + if (file) put_filp(file); + put_unused_fd(fd); +} + +static int +pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size) +{ + DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size)); + + while (size > 0) { + unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT; + + + if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY)) + return -ENOMEM; + + addr += PAGE_SIZE; + buf += PAGE_SIZE; + size -= PAGE_SIZE; + } + return 0; +} + +/* + * allocate a sampling buffer and remaps it into the user address space of the task + */ +static int +pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma = NULL; + unsigned long size; + void *smpl_buf; + + + /* + * the fixed header + requested size and align to page boundary + */ + size = PAGE_ALIGN(rsize); + + DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size)); + + /* + * check requested size to avoid Denial-of-service attacks + * XXX: may have to refine this test + * Check against address space limit. + * + * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) + * return -ENOMEM; + */ + if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -ENOMEM; + + /* + * We do the easy to undo allocations first. + * + * pfm_rvmalloc(), clears the buffer, so there is no leak + */ + smpl_buf = pfm_rvmalloc(size); + if (smpl_buf == NULL) { + DPRINT(("Can't allocate sampling buffer\n")); + return -ENOMEM; + } + + DPRINT(("smpl_buf @%p\n", smpl_buf)); + + /* allocate vma */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + DPRINT(("Cannot allocate vma\n")); + goto error_kmem; + } + memset(vma, 0, sizeof(*vma)); + + /* + * partially initialize the vma for the sampling buffer + */ + vma->vm_mm = mm; + vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ + + /* + * Now we have everything we need and we can initialize + * and connect all the data structures + */ + + ctx->ctx_smpl_hdr = smpl_buf; + ctx->ctx_smpl_size = size; /* aligned size */ + + /* + * Let's do the difficult operations next. + * + * now we atomically find some area in the address space and + * remap the buffer in it. 
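+	 * Both steps are done with mmap_sem held for writing, so the
+	 * area lookup and the vma insertion cannot race with other
+	 * mappings of this address space.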
+ */ + down_write(&task->mm->mmap_sem); + + /* find some free area in address space, must have mmap sem held */ + vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0); + if (vma->vm_start == 0UL) { + DPRINT(("Cannot find unmapped area for size %ld\n", size)); + up_write(&task->mm->mmap_sem); + goto error; + } + vma->vm_end = vma->vm_start + size; + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + + DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start)); + + /* can only be applied to current task, need to have the mm semaphore held when called */ + if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) { + DPRINT(("Can't remap buffer\n")); + up_write(&task->mm->mmap_sem); + goto error; + } + + /* + * now insert the vma in the vm list for the process, must be + * done with mmap lock held + */ + insert_vm_struct(mm, vma); + + mm->total_vm += size >> PAGE_SHIFT; + vm_stat_account(vma); + up_write(&task->mm->mmap_sem); + + /* + * keep track of user level virtual address + */ + ctx->ctx_smpl_vaddr = (void *)vma->vm_start; + *(unsigned long *)user_vaddr = vma->vm_start; + + return 0; + +error: + kmem_cache_free(vm_area_cachep, vma); +error_kmem: + pfm_rvfree(smpl_buf, size); + + return -ENOMEM; +} + +/* + * XXX: do something better here + */ +static int +pfm_bad_permissions(struct task_struct *task) +{ + /* inspired by ptrace_attach() */ + DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n", + current->uid, + current->gid, + task->euid, + task->suid, + task->uid, + task->egid, + task->sgid)); + + return ((current->uid != task->euid) + || (current->uid != task->suid) + || (current->uid != task->uid) + || (current->gid != task->egid) + || (current->gid != task->sgid) + || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE); +} + +static int +pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx) +{ + int ctx_flags; + + /* valid signal */ + + ctx_flags = pfx->ctx_flags; + + if (ctx_flags & PFM_FL_SYSTEM_WIDE) { + + /* + * cannot block in this mode + */ + if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { + DPRINT(("cannot use blocking mode when in system wide monitoring\n")); + return -EINVAL; + } + } else { + } + /* probably more to add here */ + + return 0; +} + +static int +pfm_setup_buffer_fmt(struct task_struct *task, pfm_context_t *ctx, unsigned int ctx_flags, + unsigned int cpu, pfarg_context_t *arg) +{ + pfm_buffer_fmt_t *fmt = NULL; + unsigned long size = 0UL; + void *uaddr = NULL; + void *fmt_arg = NULL; + int ret = 0; +#define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1) + + /* invoke and lock buffer format, if found */ + fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); + if (fmt == NULL) { + DPRINT(("[%d] cannot find buffer format\n", task->pid)); + return -EINVAL; + } + + /* + * buffer argument MUST be contiguous to pfarg_context_t + */ + if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg); + + ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg); + + DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret)); + + if (ret) goto error; + + /* link buffer format and context */ + ctx->ctx_buf_fmt = fmt; + + /* + * check if buffer format wants to use perfmon buffer allocation/mapping service + */ + ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size); + if (ret) goto error; + + if (size) { + /* + * buffer is always remapped into the caller's address space + */ + ret = pfm_smpl_buffer_alloc(current, ctx, size, &uaddr); 
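+		/*
+		 * on failure, pfm_smpl_buffer_alloc() has already released
+		 * the buffer and vma it allocated
+		 */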
+ if (ret) goto error; + + /* keep track of user address of buffer */ + arg->ctx_smpl_vaddr = uaddr; + } + ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg); + +error: + return ret; +} + +static void +pfm_reset_pmu_state(pfm_context_t *ctx) +{ + int i; + + /* + * install reset values for PMC. + */ + for (i=1; PMC_IS_LAST(i) == 0; i++) { + if (PMC_IS_IMPL(i) == 0) continue; + ctx->ctx_pmcs[i] = PMC_DFL_VAL(i); + DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i])); + } + /* + * PMD registers are set to 0UL when the context is memset() + */ + + /* + * On context switch restore, we must restore ALL pmc and ALL pmd even + * when they are not actively used by the task. In UP, the incoming process + * may otherwise pick up leftover PMC, PMD state from the previous process. + * As opposed to PMD, stale PMC can cause harm to the incoming + * process because they may change what is being measured. + * Therefore, we must systematically reinstall the entire + * PMC state. In SMP, the same thing is possible on the + * same CPU but also between 2 CPUs. + * + * The problem with PMD is information leakage, especially + * to user level when psr.sp=0 + * + * There is unfortunately no easy way to avoid this problem + * on either UP or SMP. This definitely slows down the + * pfm_load_regs() function. + */ + + /* + * bitmask of all PMCs accessible to this context + * + * PMC0 is treated differently. + */ + ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1; + + /* + * bitmask of all PMDs that are accessible to this context + */ + ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0]; + + DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0], ctx->ctx_all_pmds[0])); + + /* + * useful in case of re-enable after disable + */ + ctx->ctx_used_ibrs[0] = 0UL; + ctx->ctx_used_dbrs[0] = 0UL; +} + +static int +pfm_ctx_getsize(void *arg, size_t *sz) +{ + pfarg_context_t *req = (pfarg_context_t *)arg; + pfm_buffer_fmt_t *fmt; + + *sz = 0; + + if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0; + + fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id); + if (fmt == NULL) { + DPRINT(("cannot find buffer format\n")); + return -EINVAL; + } + /* get just enough to copy in user parameters */ + *sz = fmt->fmt_arg_size; + DPRINT(("arg_size=%lu\n", *sz)); + + return 0; +} + + + +/* + * cannot attach if: + * - kernel task + * - task not owned by caller + * - task incompatible with context mode + */ +static int +pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) +{ + /* + * no kernel task or task not owned by caller + */ + if (task->mm == NULL) { + DPRINT(("task [%d] has no memory context (kernel thread)\n", task->pid)); + return -EPERM; + } + if (pfm_bad_permissions(task)) { + DPRINT(("no permission to attach to [%d]\n", task->pid)); + return -EPERM; + } + /* + * cannot block in self-monitoring mode + */ + if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) { + DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid)); + return -EINVAL; + } + + if (task->exit_state == EXIT_ZOMBIE) { + DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); + return -EBUSY; + } + + /* + * always ok for self + */ + if (task == current) return 0; + + if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { + DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state)); + return -EBUSY; + } + /* + * make sure the task is off any CPU + */ + wait_task_inactive(task); + + /* more to come...
*/ + + return 0; +} + +static int +pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task) +{ + struct task_struct *p = current; + int ret; + + /* XXX: need to add more checks here */ + if (pid < 2) return -EPERM; + + if (pid != current->pid) { + + read_lock(&tasklist_lock); + + p = find_task_by_pid(pid); + + /* make sure task cannot go away while we operate on it */ + if (p) get_task_struct(p); + + read_unlock(&tasklist_lock); + + if (p == NULL) return -ESRCH; + } + + ret = pfm_task_incompatible(ctx, p); + if (ret == 0) { + *task = p; + } else if (p != current) { + pfm_put_task(p); + } + return ret; +} + + + +static int +pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + pfarg_context_t *req = (pfarg_context_t *)arg; + struct file *filp; + int ctx_flags; + int ret; + + /* let's check the arguments first */ + ret = pfarg_is_sane(current, req); + if (ret < 0) return ret; + + ctx_flags = req->ctx_flags; + + ret = -ENOMEM; + + ctx = pfm_context_alloc(); + if (!ctx) goto error; + + ret = pfm_alloc_fd(&filp); + if (ret < 0) goto error_file; + + req->ctx_fd = ctx->ctx_fd = ret; + + /* + * attach context to file + */ + filp->private_data = ctx; + + /* + * does the user want to sample? + */ + if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) { + ret = pfm_setup_buffer_fmt(current, ctx, ctx_flags, 0, req); + if (ret) goto buffer_error; + } + + /* + * init context protection lock + */ + spin_lock_init(&ctx->ctx_lock); + + /* + * context is unloaded + */ + ctx->ctx_state = PFM_CTX_UNLOADED; + + /* + * initialization of context's flags + */ + ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; + ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; + ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ + ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; + /* + * will move to set properties + * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; + */ + + /* + * init restart semaphore to locked + */ + sema_init(&ctx->ctx_restart_sem, 0); + + /* + * activation is used in SMP only + */ + ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; + SET_LAST_CPU(ctx, -1); + + /* + * initialize notification message queue + */ + ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; + init_waitqueue_head(&ctx->ctx_msgq_wait); + init_waitqueue_head(&ctx->ctx_zombieq); + + DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", + ctx, + ctx_flags, + ctx->ctx_fl_system, + ctx->ctx_fl_block, + ctx->ctx_fl_excl_idle, + ctx->ctx_fl_no_msg, + ctx->ctx_fd)); + + /* + * initialize soft PMU state + */ + pfm_reset_pmu_state(ctx); + + return 0; + +buffer_error: + pfm_free_fd(ctx->ctx_fd, filp); + + if (ctx->ctx_buf_fmt) { + pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs); + } +error_file: + pfm_context_free(ctx); + +error: + return ret; +} + +static inline unsigned long +pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset) +{ + unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset; + unsigned long new_seed, old_seed = reg->seed, mask = reg->mask; + extern unsigned long carta_random32 (unsigned long seed); + + if (reg->flags & PFM_REGFL_RANDOM) { + new_seed = carta_random32(old_seed); + val -= (old_seed & mask); /* counter values are negative numbers! 
*/ + if ((mask >> 32) != 0) + /* construct a full 64-bit random value: */ + new_seed |= carta_random32(old_seed >> 32) << 32; + reg->seed = new_seed; + } + reg->lval = val; + return val; +} + +static void +pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) +{ + unsigned long mask = ovfl_regs[0]; + unsigned long reset_others = 0UL; + unsigned long val; + int i; + + /* + * now restore reset value on sampling overflowed counters + */ + mask >>= PMU_FIRST_COUNTER; + for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { + + if ((mask & 0x1UL) == 0UL) continue; + + ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset); + reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; + + DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); + } + + /* + * Now take care of resetting the other registers + */ + for(i = 0; reset_others; i++, reset_others >>= 1) { + + if ((reset_others & 0x1) == 0) continue; + + ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); + + DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", + is_long_reset ? "long" : "short", i, val)); + } +} + +static void +pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) +{ + unsigned long mask = ovfl_regs[0]; + unsigned long reset_others = 0UL; + unsigned long val; + int i; + + DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset)); + + if (ctx->ctx_state == PFM_CTX_MASKED) { + pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset); + return; + } + + /* + * now restore reset value on sampling overflowed counters + */ + mask >>= PMU_FIRST_COUNTER; + for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { + + if ((mask & 0x1UL) == 0UL) continue; + + val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset); + reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; + + DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); + + pfm_write_soft_counter(ctx, i, val); + } + + /* + * Now take care of resetting the other registers + */ + for(i = 0; reset_others; i++, reset_others >>= 1) { + + if ((reset_others & 0x1) == 0) continue; + + val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); + + if (PMD_IS_COUNTING(i)) { + pfm_write_soft_counter(ctx, i, val); + } else { + ia64_set_pmd(i, val); + } + DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", + is_long_reset ? "long" : "short", i, val)); + } + ia64_srlz_d(); +} + +static int +pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct thread_struct *thread = NULL; + struct task_struct *task; + pfarg_reg_t *req = (pfarg_reg_t *)arg; + unsigned long value, pmc_pm; + unsigned long smpl_pmds, reset_pmds, impl_pmds; + unsigned int cnum, reg_flags, flags, pmc_type; + int i, can_access_pmu = 0, is_loaded, is_system, expert_mode; + int is_monitor, is_counting, state; + int ret = -EINVAL; + pfm_reg_check_t wr_func; +#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z)) + + state = ctx->ctx_state; + is_loaded = state == PFM_CTX_LOADED ? 1 : 0; + is_system = ctx->ctx_fl_system; + task = ctx->ctx_task; + impl_pmds = pmu_conf->impl_pmds[0]; + + if (state == PFM_CTX_ZOMBIE) return -EINVAL; + + if (is_loaded) { + thread = &task->thread; + /* + * In system wide and when the context is loaded, access can only happen + * when the caller is running on the CPU being monitored by the session. + * It does not have to be the owner (ctx_task) of the context per se. 
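+ *
+ * For illustration only, a minimal and hedged user-level sketch of
+ * programming one counting monitor through this entry point. It
+ * assumes the perfmonctl() syscall, the command names from
+ * <asm/perfmon.h>, and the pfarg_reg_t layout used throughout this
+ * file; PMC4/PMD4 is the first generic counter pair on Itanium:
+ *
+ *	pfarg_reg_t pc;
+ *	memset(&pc, 0, sizeof(pc));
+ *	pc.reg_num   = 4;	(event select and privilege bits go in reg_value)
+ *	pc.reg_value = ...;
+ *	perfmonctl(ctx_fd, PFM_WRITE_PMCS, &pc, 1);
+ *
+ * reg_smpl_pmds/reg_reset_pmds may name additional PMDs; they are
+ * validated against impl_pmds below.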
+ */ + if (is_system && ctx->ctx_cpu != smp_processor_id()) { + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + return -EBUSY; + } + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; + } + expert_mode = pfm_sysctl.expert_mode; + + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + reg_flags = req->reg_flags; + value = req->reg_value; + smpl_pmds = req->reg_smpl_pmds[0]; + reset_pmds = req->reg_reset_pmds[0]; + flags = 0; + + + if (cnum >= PMU_MAX_PMCS) { + DPRINT(("pmc%u is invalid\n", cnum)); + goto error; + } + + pmc_type = pmu_conf->pmc_desc[cnum].type; + pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1; + is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0; + is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0; + + /* + * we reject all non implemented PMC as well + * as attempts to modify PMC[0-3] which are used + * as status registers by the PMU + */ + if ((pmc_type & PFM_REG_IMPL) == 0 || (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) { + DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type)); + goto error; + } + wr_func = pmu_conf->pmc_desc[cnum].write_check; + /* + * If the PMC is a monitor, then if the value is not the default: + * - system-wide session: PMCx.pm=1 (privileged monitor) + * - per-task : PMCx.pm=0 (user monitor) + */ + if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) { + DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n", + cnum, + pmc_pm, + is_system)); + goto error; + } + + if (is_counting) { + /* + * enforce generation of overflow interrupt. Necessary on all + * CPUs. + */ + value |= 1 << PMU_PMC_OI; + + if (reg_flags & PFM_REGFL_OVFL_NOTIFY) { + flags |= PFM_REGFL_OVFL_NOTIFY; + } + + if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM; + + /* verify validity of smpl_pmds */ + if ((smpl_pmds & impl_pmds) != smpl_pmds) { + DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum)); + goto error; + } + + /* verify validity of reset_pmds */ + if ((reset_pmds & impl_pmds) != reset_pmds) { + DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum)); + goto error; + } + } else { + if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { + DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum)); + goto error; + } + /* eventid on non-counting monitors are ignored */ + } + + /* + * execute write checker, if any + */ + if (likely(expert_mode == 0 && wr_func)) { + ret = (*wr_func)(task, ctx, cnum, &value, regs); + if (ret) goto error; + ret = -EINVAL; + } + + /* + * no error on this register + */ + PFM_REG_RETFLAG_SET(req->reg_flags, 0); + + /* + * Now we commit the changes to the software state + */ + + /* + * update overflow information + */ + if (is_counting) { + /* + * full flag update each time a register is programmed + */ + ctx->ctx_pmds[cnum].flags = flags; + + ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds; + ctx->ctx_pmds[cnum].smpl_pmds[0] = smpl_pmds; + ctx->ctx_pmds[cnum].eventid = req->reg_smpl_eventid; + + /* + * Mark all PMDS to be accessed as used. + * + * We do not keep track of PMC because we have to + * systematically restore ALL of them. + * + * We do not update the used_monitors mask, because + * if we have not programmed them, then will be in + * a quiescent state, therefore we will not need to + * mask/restore then when context is MASKED. 
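+ *
+ * Note: the CTX_USED_PMD() calls below record the smpl_pmds and
+ * reset_pmds sets as used, which is also what later allows them to
+ * be read back through pfm_read_pmds().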
+ */ + CTX_USED_PMD(ctx, reset_pmds); + CTX_USED_PMD(ctx, smpl_pmds); + /* + * make sure we do not try to reset on + * restart because we have established new values + */ + if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~1UL << cnum; + } + /* + * Needed in case the user does not initialize the equivalent + * PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no + * possible leak here. + */ + CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]); + + /* + * keep track of the monitor PMC that we are using. + * we save the value of the pmc in ctx_pmcs[] and if + * the monitoring is not stopped for the context we also + * place it in the saved state area so that it will be + * picked up later by the context switch code. + * + * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs(). + * + * The value in thread->pmcs[] may be modified on overflow, i.e., when + * monitoring needs to be stopped. + */ + if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum); + + /* + * update context state + */ + ctx->ctx_pmcs[cnum] = value; + + if (is_loaded) { + /* + * write thread state + */ + if (is_system == 0) thread->pmcs[cnum] = value; + + /* + * write hardware register if we can + */ + if (can_access_pmu) { + ia64_set_pmc(cnum, value); + } +#ifdef CONFIG_SMP + else { + /* + * per-task SMP only here + * + * we are guaranteed that the task is not running on the other CPU, + * we indicate that this PMD will need to be reloaded if the task + * is rescheduled on the CPU it ran last on. + */ + ctx->ctx_reload_pmcs[0] |= 1UL << cnum; + } +#endif + } + + DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n", + cnum, + value, + is_loaded, + can_access_pmu, + flags, + ctx->ctx_all_pmcs[0], + ctx->ctx_used_pmds[0], + ctx->ctx_pmds[cnum].eventid, + smpl_pmds, + reset_pmds, + ctx->ctx_reload_pmcs[0], + ctx->ctx_used_monitors[0], + ctx->ctx_ovfl_regs[0])); + } + + /* + * make sure the changes are visible + */ + if (can_access_pmu) ia64_srlz_d(); + + return 0; +error: + PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); + return ret; +} + +static int +pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct thread_struct *thread = NULL; + struct task_struct *task; + pfarg_reg_t *req = (pfarg_reg_t *)arg; + unsigned long value, hw_value, ovfl_mask; + unsigned int cnum; + int i, can_access_pmu = 0, state; + int is_counting, is_loaded, is_system, expert_mode; + int ret = -EINVAL; + pfm_reg_check_t wr_func; + + + state = ctx->ctx_state; + is_loaded = state == PFM_CTX_LOADED ? 1 : 0; + is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf->ovfl_val; + task = ctx->ctx_task; + + if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL; + + /* + * on both UP and SMP, we can only write to the PMC when the task is + * the owner of the local PMU. + */ + if (likely(is_loaded)) { + thread = &task->thread; + /* + * In system wide and when the context is loaded, access can only happen + * when the caller is running on the CPU being monitored by the session. + * It does not have to be the owner (ctx_task) of the context per se. + */ + if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + return -EBUSY; + } + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 
1 : 0; + } + expert_mode = pfm_sysctl.expert_mode; + + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + value = req->reg_value; + + if (!PMD_IS_IMPL(cnum)) { + DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum)); + goto abort_mission; + } + is_counting = PMD_IS_COUNTING(cnum); + wr_func = pmu_conf->pmd_desc[cnum].write_check; + + /* + * execute write checker, if any + */ + if (unlikely(expert_mode == 0 && wr_func)) { + unsigned long v = value; + + ret = (*wr_func)(task, ctx, cnum, &v, regs); + if (ret) goto abort_mission; + + value = v; + ret = -EINVAL; + } + + /* + * no error on this register + */ + PFM_REG_RETFLAG_SET(req->reg_flags, 0); + + /* + * now commit changes to software state + */ + hw_value = value; + + /* + * update virtualized (64bits) counter + */ + if (is_counting) { + /* + * write context state + */ + ctx->ctx_pmds[cnum].lval = value; + + /* + * when context is load we use the split value + */ + if (is_loaded) { + hw_value = value & ovfl_mask; + value = value & ~ovfl_mask; + } + } + /* + * update reset values (not just for counters) + */ + ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; + ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; + + /* + * update randomization parameters (not just for counters) + */ + ctx->ctx_pmds[cnum].seed = req->reg_random_seed; + ctx->ctx_pmds[cnum].mask = req->reg_random_mask; + + /* + * update context value + */ + ctx->ctx_pmds[cnum].val = value; + + /* + * Keep track of what we use + * + * We do not keep track of PMC because we have to + * systematically restore ALL of them. + */ + CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum)); + + /* + * mark this PMD register used as well + */ + CTX_USED_PMD(ctx, RDEP(cnum)); + + /* + * make sure we do not try to reset on + * restart because we have established new values + */ + if (is_counting && state == PFM_CTX_MASKED) { + ctx->ctx_ovfl_regs[0] &= ~1UL << cnum; + } + + if (is_loaded) { + /* + * write thread state + */ + if (is_system == 0) thread->pmds[cnum] = hw_value; + + /* + * write hardware register if we can + */ + if (can_access_pmu) { + ia64_set_pmd(cnum, hw_value); + } else { +#ifdef CONFIG_SMP + /* + * we are guaranteed that the task is not running on the other CPU, + * we indicate that this PMD will need to be reloaded if the task + * is rescheduled on the CPU it ran last on. + */ + ctx->ctx_reload_pmds[0] |= 1UL << cnum; +#endif + } + } + + DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx " + "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n", + cnum, + value, + is_loaded, + can_access_pmu, + hw_value, + ctx->ctx_pmds[cnum].val, + ctx->ctx_pmds[cnum].short_reset, + ctx->ctx_pmds[cnum].long_reset, + PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N', + ctx->ctx_pmds[cnum].seed, + ctx->ctx_pmds[cnum].mask, + ctx->ctx_used_pmds[0], + ctx->ctx_pmds[cnum].reset_pmds[0], + ctx->ctx_reload_pmds[0], + ctx->ctx_all_pmds[0], + ctx->ctx_ovfl_regs[0])); + } + + /* + * make changes visible + */ + if (can_access_pmu) ia64_srlz_d(); + + return 0; + +abort_mission: + /* + * for now, we have only one possibility for error + */ + PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); + return ret; +} + +/* + * By the way of PROTECT_CONTEXT(), interrupts are masked while we are in this function. + * Therefore we know, we do not have to worry about the PMU overflow interrupt. 
If an + * interrupt is delivered during the call, it will be kept pending until we leave, making + * it appear as if it had been generated at the UNPROTECT_CONTEXT(). At least we are + * guaranteed to return consistent data to the user, it may simply be old. It is not + * trivial to treat the overflow while inside the call because you may end up in + * some module sampling buffer code causing deadlocks. + */ +static int +pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct thread_struct *thread = NULL; + struct task_struct *task; + unsigned long val = 0UL, lval, ovfl_mask, sval; + pfarg_reg_t *req = (pfarg_reg_t *)arg; + unsigned int cnum, reg_flags = 0; + int i, can_access_pmu = 0, state; + int is_loaded, is_system, is_counting, expert_mode; + int ret = -EINVAL; + pfm_reg_check_t rd_func; + + /* + * access is possible when loaded only for + * self-monitoring tasks or in UP mode + */ + + state = ctx->ctx_state; + is_loaded = state == PFM_CTX_LOADED ? 1 : 0; + is_system = ctx->ctx_fl_system; + ovfl_mask = pmu_conf->ovfl_val; + task = ctx->ctx_task; + + if (state == PFM_CTX_ZOMBIE) return -EINVAL; + + if (likely(is_loaded)) { + thread = &task->thread; + /* + * In system wide and when the context is loaded, access can only happen + * when the caller is running on the CPU being monitored by the session. + * It does not have to be the owner (ctx_task) of the context per se. + */ + if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + return -EBUSY; + } + /* + * this can be true when not self-monitoring only in UP + */ + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; + + if (can_access_pmu) ia64_srlz_d(); + } + expert_mode = pfm_sysctl.expert_mode; + + DPRINT(("ld=%d apmu=%d ctx_state=%d\n", + is_loaded, + can_access_pmu, + state)); + + /* + * on both UP and SMP, we can only read the PMD from the hardware register when + * the task is the owner of the local PMU. + */ + + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + reg_flags = req->reg_flags; + + if (unlikely(!PMD_IS_IMPL(cnum))) goto error; + /* + * we can only read the registers that we use. That includes + * the ones we explicitly initialize AND the ones we want included + * in the sampling buffer (smpl_regs). + * + * Having this restriction allows optimization in the ctxsw routine + * without compromising security (leaks) + */ + if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error; + + sval = ctx->ctx_pmds[cnum].val; + lval = ctx->ctx_pmds[cnum].lval; + is_counting = PMD_IS_COUNTING(cnum); + + /* + * If the task is not the current one, then we check if the + * PMU state is still in the local live register due to lazy ctxsw. + * If true, then we read directly from the registers. + */ + if (can_access_pmu) { + val = ia64_get_pmd(cnum); + } else { + /* + * context has been saved + * if context is zombie, then task does not exist anymore. + * In this case, we use the full value saved in the context (pfm_flush_regs()). + */ + val = is_loaded ?
thread->pmds[cnum] : 0UL; + } + rd_func = pmu_conf->pmd_desc[cnum].read_check; + + if (is_counting) { + /* + * XXX: need to check for overflow when loaded + */ + val &= ovfl_mask; + val += sval; + } + + /* + * execute read checker, if any + */ + if (unlikely(expert_mode == 0 && rd_func)) { + unsigned long v = val; + ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs); + if (ret) goto error; + val = v; + ret = -EINVAL; + } + + PFM_REG_RETFLAG_SET(reg_flags, 0); + + DPRINT(("pmd[%u]=0x%lx\n", cnum, val)); + + /* + * update register return value, abort all if problem during copy. + * we only modify the reg_flags field. no check mode is fine because + * access has been verified upfront in sys_perfmonctl(). + */ + req->reg_value = val; + req->reg_flags = reg_flags; + req->reg_last_reset_val = lval; + } + + return 0; + +error: + PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); + return ret; +} + +int +pfm_mod_write_pmcs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) +{ + pfm_context_t *ctx; + + if (req == NULL) return -EINVAL; + + ctx = GET_PMU_CTX(); + + if (ctx == NULL) return -EINVAL; + + /* + * for now limit to current task, which is enough when calling + * from overflow handler + */ + if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; + + return pfm_write_pmcs(ctx, req, nreq, regs); +} +EXPORT_SYMBOL(pfm_mod_write_pmcs); + +int +pfm_mod_read_pmds(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) +{ + pfm_context_t *ctx; + + if (req == NULL) return -EINVAL; + + ctx = GET_PMU_CTX(); + + if (ctx == NULL) return -EINVAL; + + /* + * for now limit to current task, which is enough when calling + * from overflow handler + */ + if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; + + return pfm_read_pmds(ctx, req, nreq, regs); +} +EXPORT_SYMBOL(pfm_mod_read_pmds); + +/* + * Only call this function when a process is trying to + * write the debug registers (reading is always allowed) + */ +int +pfm_use_debug_registers(struct task_struct *task) +{ + pfm_context_t *ctx = task->thread.pfm_context; + unsigned long flags; + int ret = 0; + + if (pmu_conf->use_rr_dbregs == 0) return 0; + + DPRINT(("called for [%d]\n", task->pid)); + + /* + * do it only once + */ + if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0; + + /* + * Even on SMP, we do not need to use an atomic here because + * the only way in is via ptrace() and this is possible only when the + * process is stopped. Even in the case where the ctxsw out is not totally + * completed by the time we come here, there is no way the 'stopped' process + * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine. + * So this is always safe. + */ + if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1; + + LOCK_PFS(flags); + + /* + * We cannot allow setting breakpoints when system wide monitoring + * sessions are using the debug registers. + */ + if (pfm_sessions.pfs_sys_use_dbregs > 0) + ret = -1; + else + pfm_sessions.pfs_ptrace_use_dbregs++; + + DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n", + pfm_sessions.pfs_ptrace_use_dbregs, + pfm_sessions.pfs_sys_use_dbregs, + task->pid, ret)); + + UNLOCK_PFS(flags); + + return ret; +} + +/* + * This function is called for every task that exits with the + * IA64_THREAD_DBG_VALID flag set. This indicates a task which was + * able to use the debug registers for debugging purposes via + * ptrace().
Therefore we know it was not using them for + * performance monitoring, so we only decrement the number + * of "ptraced" debug register users to keep the count up to date + */ +int +pfm_release_debug_registers(struct task_struct *task) +{ + unsigned long flags; + int ret; + + if (pmu_conf->use_rr_dbregs == 0) return 0; + + LOCK_PFS(flags); + if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { + printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid); + ret = -1; + } else { + pfm_sessions.pfs_ptrace_use_dbregs--; + ret = 0; + } + UNLOCK_PFS(flags); + + return ret; +} + +static int +pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct task_struct *task; + pfm_buffer_fmt_t *fmt; + pfm_ovfl_ctrl_t rst_ctrl; + int state, is_system; + int ret = 0; + + state = ctx->ctx_state; + fmt = ctx->ctx_buf_fmt; + is_system = ctx->ctx_fl_system; + task = PFM_CTX_TASK(ctx); + + switch(state) { + case PFM_CTX_MASKED: + break; + case PFM_CTX_LOADED: + if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break; + /* fall through */ + case PFM_CTX_UNLOADED: + case PFM_CTX_ZOMBIE: + DPRINT(("invalid state=%d\n", state)); + return -EBUSY; + default: + DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state)); + return -EINVAL; + } + + /* + * In system wide and when the context is loaded, access can only happen + * when the caller is running on the CPU being monitored by the session. + * It does not have to be the owner (ctx_task) of the context per se. + */ + if (is_system && ctx->ctx_cpu != smp_processor_id()) { + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + return -EBUSY; + } + + /* sanity check */ + if (unlikely(task == NULL)) { + printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid); + return -EINVAL; + } + + if (task == current || is_system) { + + fmt = ctx->ctx_buf_fmt; + + DPRINT(("restarting self %d ovfl=0x%lx\n", + task->pid, + ctx->ctx_ovfl_regs[0])); + + if (CTX_HAS_SMPL(ctx)) { + + prefetch(ctx->ctx_smpl_hdr); + + rst_ctrl.bits.mask_monitoring = 0; + rst_ctrl.bits.reset_ovfl_pmds = 0; + + if (state == PFM_CTX_LOADED) + ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + else + ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + } else { + rst_ctrl.bits.mask_monitoring = 0; + rst_ctrl.bits.reset_ovfl_pmds = 1; + } + + if (ret == 0) { + if (rst_ctrl.bits.reset_ovfl_pmds) + pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET); + + if (rst_ctrl.bits.mask_monitoring == 0) { + DPRINT(("resuming monitoring for [%d]\n", task->pid)); + + if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task); + } else { + DPRINT(("keeping monitoring stopped for [%d]\n", task->pid)); + + // cannot use pfm_stop_monitoring(task, regs); + } + } + /* + * clear overflowed PMD mask to remove any stale information + */ + ctx->ctx_ovfl_regs[0] = 0UL; + + /* + * back to LOADED state + */ + ctx->ctx_state = PFM_CTX_LOADED; + + /* + * XXX: not really useful for self monitoring + */ + ctx->ctx_fl_can_restart = 0; + + return 0; + } + + /* + * restart another task + */ + + /* + * When PFM_CTX_MASKED, we cannot issue a restart before the previous + * one is seen by the task. + */ + if (state == PFM_CTX_MASKED) { + if (ctx->ctx_fl_can_restart == 0) return -EINVAL; + /* + * will prevent subsequent restart before this one is + * seen by the other task + */ + ctx->ctx_fl_can_restart = 0; + } + + /* + * if blocking, then post the semaphore if PFM_CTX_MASKED, i.e.
+ * the task is blocked or on its way to block. That's the normal + * restart path. If the monitoring is not masked, then the task + * can be actively monitoring and we cannot directly intervene. + * Therefore we use the trap mechanism to catch the task and + * force it to reset the buffer/reset PMDs. + * + * if non-blocking, then we ensure that the task will go into + * pfm_handle_work() before returning to user mode. + * + * We cannot explicitely reset another task, it MUST always + * be done by the task itself. This works for system wide because + * the tool that is controlling the session is logically doing + * "self-monitoring". + */ + if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { + DPRINT(("unblocking [%d] \n", task->pid)); + up(&ctx->ctx_restart_sem); + } else { + DPRINT(("[%d] armed exit trap\n", task->pid)); + + ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET; + + PFM_SET_WORK_PENDING(task, 1); + + pfm_set_task_notify(task); + + /* + * XXX: send reschedule if task runs on another CPU + */ + } + return 0; +} + +static int +pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + unsigned int m = *(unsigned int *)arg; + + pfm_sysctl.debug = m == 0 ? 0 : 1; + + pfm_debug_var = pfm_sysctl.debug; + + printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off"); + + if (m == 0) { + memset(pfm_stats, 0, sizeof(pfm_stats)); + for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL; + } + return 0; +} + +/* + * arg can be NULL and count can be zero for this function + */ +static int +pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct thread_struct *thread = NULL; + struct task_struct *task; + pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg; + unsigned long flags; + dbreg_t dbreg; + unsigned int rnum; + int first_time; + int ret = 0, state; + int i, can_access_pmu = 0; + int is_system, is_loaded; + + if (pmu_conf->use_rr_dbregs == 0) return -EINVAL; + + state = ctx->ctx_state; + is_loaded = state == PFM_CTX_LOADED ? 1 : 0; + is_system = ctx->ctx_fl_system; + task = ctx->ctx_task; + + if (state == PFM_CTX_ZOMBIE) return -EINVAL; + + /* + * on both UP and SMP, we can only write to the PMC when the task is + * the owner of the local PMU. + */ + if (is_loaded) { + thread = &task->thread; + /* + * In system wide and when the context is loaded, access can only happen + * when the caller is running on the CPU being monitored by the session. + * It does not have to be the owner (ctx_task) of the context per se. + */ + if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + return -EBUSY; + } + can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; + } + + /* + * we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w + * ensuring that no real breakpoint can be installed via this call. 
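+ *
+ * A hedged user-level sketch (pfarg_dbreg_t as used below; the
+ * perfmonctl() command name is from <asm/perfmon.h>; debug registers
+ * come in even/odd pairs, even = address, odd = mask and control
+ * bits):
+ *
+ *	pfarg_dbreg_t d[2];
+ *	memset(d, 0, sizeof(d));
+ *	d[0].dbreg_num = 0; d[0].dbreg_value = code_address;
+ *	d[1].dbreg_num = 1; d[1].dbreg_value = address_mask;
+ *	perfmonctl(fd, PFM_WRITE_IBRS, d, 2);
+ *
+ * whatever enable bits the caller sets in the odd registers are
+ * cleared here anyway (ibr.x, dbr.r, dbr.w).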
+ * + * IMPORTANT: regs can be NULL in this function + */ + + first_time = ctx->ctx_fl_using_dbreg == 0; + + /* + * don't bother if we are loaded and task is being debugged + */ + if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) { + DPRINT(("debug registers already in use for [%d]\n", task->pid)); + return -EBUSY; + } + + /* + * check for debug registers in system wide mode + * + * If though a check is done in pfm_context_load(), + * we must repeat it here, in case the registers are + * written after the context is loaded + */ + if (is_loaded) { + LOCK_PFS(flags); + + if (first_time && is_system) { + if (pfm_sessions.pfs_ptrace_use_dbregs) + ret = -EBUSY; + else + pfm_sessions.pfs_sys_use_dbregs++; + } + UNLOCK_PFS(flags); + } + + if (ret != 0) return ret; + + /* + * mark ourself as user of the debug registers for + * perfmon purposes. + */ + ctx->ctx_fl_using_dbreg = 1; + + /* + * clear hardware registers to make sure we don't + * pick up stale state. + * + * for a system wide session, we do not use + * thread.dbr, thread.ibr because this process + * never leaves the current CPU and the state + * is shared by all processes running on it + */ + if (first_time && can_access_pmu) { + DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); + for (i=0; i < pmu_conf->num_ibrs; i++) { + ia64_set_ibr(i, 0UL); + ia64_dv_serialize_instruction(); + } + ia64_srlz_i(); + for (i=0; i < pmu_conf->num_dbrs; i++) { + ia64_set_dbr(i, 0UL); + ia64_dv_serialize_data(); + } + ia64_srlz_d(); + } + + /* + * Now install the values into the registers + */ + for (i = 0; i < count; i++, req++) { + + rnum = req->dbreg_num; + dbreg.val = req->dbreg_value; + + ret = -EINVAL; + + if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) || ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) { + DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n", + rnum, dbreg.val, mode, i, count)); + + goto abort_mission; + } + + /* + * make sure we do not install enabled breakpoint + */ + if (rnum & 0x1) { + if (mode == PFM_CODE_RR) + dbreg.ibr.ibr_x = 0; + else + dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0; + } + + PFM_REG_RETFLAG_SET(req->dbreg_flags, 0); + + /* + * Debug registers, just like PMC, can only be modified + * by a kernel call. Moreover, perfmon() access to those + * registers are centralized in this routine. The hardware + * does not modify the value of these registers, therefore, + * if we save them as they are written, we can avoid having + * to save them on context switch out. This is made possible + * by the fact that when perfmon uses debug registers, ptrace() + * won't be able to modify them concurrently. 
+ */ + if (mode == PFM_CODE_RR) { + CTX_USED_IBR(ctx, rnum); + + if (can_access_pmu) { + ia64_set_ibr(rnum, dbreg.val); + ia64_dv_serialize_instruction(); + } + + ctx->ctx_ibrs[rnum] = dbreg.val; + + DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n", + rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu)); + } else { + CTX_USED_DBR(ctx, rnum); + + if (can_access_pmu) { + ia64_set_dbr(rnum, dbreg.val); + ia64_dv_serialize_data(); + } + ctx->ctx_dbrs[rnum] = dbreg.val; + + DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n", + rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu)); + } + } + + return 0; + +abort_mission: + /* + * in case it was our first attempt, we undo the global modifications + */ + if (first_time) { + LOCK_PFS(flags); + if (ctx->ctx_fl_system) { + pfm_sessions.pfs_sys_use_dbregs--; + } + UNLOCK_PFS(flags); + ctx->ctx_fl_using_dbreg = 0; + } + /* + * install error return flag + */ + PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL); + + return ret; +} + +static int +pfm_write_ibrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs); +} + +static int +pfm_write_dbrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs); +} + +int +pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) +{ + pfm_context_t *ctx; + + if (req == NULL) return -EINVAL; + + ctx = GET_PMU_CTX(); + + if (ctx == NULL) return -EINVAL; + + /* + * for now limit to current task, which is enough when calling + * from overflow handler + */ + if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; + + return pfm_write_ibrs(ctx, req, nreq, regs); +} +EXPORT_SYMBOL(pfm_mod_write_ibrs); + +int +pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) +{ + pfm_context_t *ctx; + + if (req == NULL) return -EINVAL; + + ctx = GET_PMU_CTX(); + + if (ctx == NULL) return -EINVAL; + + /* + * for now limit to current task, which is enough when calling + * from overflow handler + */ + if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; + + return pfm_write_dbrs(ctx, req, nreq, regs); +} +EXPORT_SYMBOL(pfm_mod_write_dbrs); + + +static int +pfm_get_features(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + pfarg_features_t *req = (pfarg_features_t *)arg; + + req->ft_version = PFM_VERSION; + return 0; +} + +static int +pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct pt_regs *tregs; + struct task_struct *task = PFM_CTX_TASK(ctx); + int state, is_system; + + state = ctx->ctx_state; + is_system = ctx->ctx_fl_system; + + /* + * context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE) + */ + if (state == PFM_CTX_UNLOADED) return -EINVAL; + + /* + * In system wide and when the context is loaded, access can only happen + * when the caller is running on the CPU being monitored by the session. + * It does not have to be the owner (ctx_task) of the context per se. 
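+ *
+ * (illustration, hedged: with the command names from the command
+ * table later in this file, a self-monitoring task simply brackets
+ * the code of interest)
+ *
+ *	perfmonctl(ctx_fd, PFM_START, NULL, 0);
+ *	... section being measured ...
+ *	perfmonctl(ctx_fd, PFM_STOP, NULL, 0);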
+ */ + if (is_system && ctx->ctx_cpu != smp_processor_id()) { + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + return -EBUSY; + } + DPRINT(("task [%d] ctx_state=%d is_system=%d\n", + PFM_CTX_TASK(ctx)->pid, + state, + is_system)); + /* + * in system mode, we need to update the PMU directly + * and the user level state of the caller, which may not + * necessarily be the creator of the context. + */ + if (is_system) { + /* + * Update local PMU first + * + * disable dcr pp + */ + ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); + ia64_srlz_i(); + + /* + * update local cpuinfo + */ + PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); + + /* + * stop monitoring, does srlz.i + */ + pfm_clear_psr_pp(); + + /* + * stop monitoring in the caller + */ + ia64_psr(regs)->pp = 0; + + return 0; + } + /* + * per-task mode + */ + + if (task == current) { + /* stop monitoring at kernel level */ + pfm_clear_psr_up(); + + /* + * stop monitoring at the user level + */ + ia64_psr(regs)->up = 0; + } else { + tregs = ia64_task_regs(task); + + /* + * stop monitoring at the user level + */ + ia64_psr(tregs)->up = 0; + + /* + * monitoring disabled in kernel at next reschedule + */ + ctx->ctx_saved_psr_up = 0; + DPRINT(("task=[%d]\n", task->pid)); + } + return 0; +} + + +static int +pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct pt_regs *tregs; + int state, is_system; + + state = ctx->ctx_state; + is_system = ctx->ctx_fl_system; + + if (state != PFM_CTX_LOADED) return -EINVAL; + + /* + * In system wide and when the context is loaded, access can only happen + * when the caller is running on the CPU being monitored by the session. + * It does not have to be the owner (ctx_task) of the context per se. + */ + if (is_system && ctx->ctx_cpu != smp_processor_id()) { + DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); + return -EBUSY; + } + + /* + * in system mode, we need to update the PMU directly + * and the user level state of the caller, which may not + * necessarily be the creator of the context. 
+ */ + if (is_system) { + + /* + * set user level psr.pp for the caller + */ + ia64_psr(regs)->pp = 1; + + /* + * now update the local PMU and cpuinfo + */ + PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP); + + /* + * start monitoring at kernel level + */ + pfm_set_psr_pp(); + + /* enable dcr pp */ + ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); + ia64_srlz_i(); + + return 0; + } + + /* + * per-process mode + */ + + if (ctx->ctx_task == current) { + + /* start monitoring at kernel level */ + pfm_set_psr_up(); + + /* + * activate monitoring at user level + */ + ia64_psr(regs)->up = 1; + + } else { + tregs = ia64_task_regs(ctx->ctx_task); + + /* + * start monitoring at the kernel level the next + * time the task is scheduled + */ + ctx->ctx_saved_psr_up = IA64_PSR_UP; + + /* + * activate monitoring at user level + */ + ia64_psr(tregs)->up = 1; + } + return 0; +} + +static int +pfm_get_pmc_reset(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + pfarg_reg_t *req = (pfarg_reg_t *)arg; + unsigned int cnum; + int i; + int ret = -EINVAL; + + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + + if (!PMC_IS_IMPL(cnum)) goto abort_mission; + + req->reg_value = PMC_DFL_VAL(cnum); + + PFM_REG_RETFLAG_SET(req->reg_flags, 0); + + DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value)); + } + return 0; + +abort_mission: + PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); + return ret; +} + +static int +pfm_check_task_exist(pfm_context_t *ctx) +{ + struct task_struct *g, *t; + int ret = -ESRCH; + + read_lock(&tasklist_lock); + + do_each_thread (g, t) { + if (t->thread.pfm_context == ctx) { + ret = 0; + break; + } + } while_each_thread (g, t); + + read_unlock(&tasklist_lock); + + DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx)); + + return ret; +} + +static int +pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct task_struct *task; + struct thread_struct *thread; + struct pfm_context_t *old; + unsigned long flags; +#ifndef CONFIG_SMP + struct task_struct *owner_task = NULL; +#endif + pfarg_load_t *req = (pfarg_load_t *)arg; + unsigned long *pmcs_source, *pmds_source; + int the_cpu; + int ret = 0; + int state, is_system, set_dbregs = 0; + + state = ctx->ctx_state; + is_system = ctx->ctx_fl_system; + /* + * can only load from unloaded or terminated state + */ + if (state != PFM_CTX_UNLOADED) { + DPRINT(("cannot load to [%d], invalid ctx_state=%d\n", + req->load_pid, + ctx->ctx_state)); + return -EINVAL; + } + + DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg)); + + if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) { + DPRINT(("cannot use blocking mode on self\n")); + return -EINVAL; + } + + ret = pfm_get_task(ctx, req->load_pid, &task); + if (ret) { + DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret)); + return ret; + } + + ret = -EINVAL; + + /* + * system wide is self monitoring only + */ + if (is_system && task != current) { + DPRINT(("system wide is self monitoring only load_pid=%d\n", + req->load_pid)); + goto error; + } + + thread = &task->thread; + + ret = 0; + /* + * cannot load a context which is using range restrictions, + * into a task that is being debugged. 
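+ *
+ * For orientation, a hedged sketch of the user-level sequence that
+ * reaches this point (command names per the command table later in
+ * this file, argument layouts as used here):
+ *
+ *	pfarg_context_t c; memset(&c, 0, sizeof(c));
+ *	perfmonctl(0, PFM_CREATE_CONTEXT, &c, 1);	(fd returned in c.ctx_fd)
+ *	pfarg_load_t l; memset(&l, 0, sizeof(l));
+ *	l.load_pid = target_pid;	(target already stopped, e.g. via ptrace)
+ *	perfmonctl(c.ctx_fd, PFM_LOAD_CONTEXT, &l, 1);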
+ */ + if (ctx->ctx_fl_using_dbreg) { + if (thread->flags & IA64_THREAD_DBG_VALID) { + ret = -EBUSY; + DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); + goto error; + } + LOCK_PFS(flags); + + if (is_system) { + if (pfm_sessions.pfs_ptrace_use_dbregs) { + DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); + ret = -EBUSY; + } else { + pfm_sessions.pfs_sys_use_dbregs++; + DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); + set_dbregs = 1; + } + } + + UNLOCK_PFS(flags); + + if (ret) goto error; + } + + /* + * SMP system-wide monitoring implies self-monitoring. + * + * The programming model expects the task to + * be pinned on a CPU throughout the session. + * Here we take note of the current CPU at the + * time the context is loaded. No call from + * another CPU will be allowed. + * + * The pinning via shed_setaffinity() + * must be done by the calling task prior + * to this call. + * + * systemwide: keep track of CPU this session is supposed to run on + */ + the_cpu = ctx->ctx_cpu = smp_processor_id(); + + ret = -EBUSY; + /* + * now reserve the session + */ + ret = pfm_reserve_session(current, is_system, the_cpu); + if (ret) goto error; + + /* + * task is necessarily stopped at this point. + * + * If the previous context was zombie, then it got removed in + * pfm_save_regs(). Therefore we should not see it here. + * If we see a context, then this is an active context + * + * XXX: needs to be atomic + */ + DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n", + thread->pfm_context, ctx)); + + old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *)); + if (old != NULL) { + DPRINT(("load_pid [%d] already has a context\n", req->load_pid)); + goto error_unres; + } + + pfm_reset_msgq(ctx); + + ctx->ctx_state = PFM_CTX_LOADED; + + /* + * link context to task + */ + ctx->ctx_task = task; + + if (is_system) { + /* + * we load as stopped + */ + PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE); + PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); + + if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE); + } else { + thread->flags |= IA64_THREAD_PM_VALID; + } + + /* + * propagate into thread-state + */ + pfm_copy_pmds(task, ctx); + pfm_copy_pmcs(task, ctx); + + pmcs_source = thread->pmcs; + pmds_source = thread->pmds; + + /* + * always the case for system-wide + */ + if (task == current) { + + if (is_system == 0) { + + /* allow user level control */ + ia64_psr(regs)->sp = 0; + DPRINT(("clearing psr.sp for [%d]\n", task->pid)); + + SET_LAST_CPU(ctx, smp_processor_id()); + INC_ACTIVATION(); + SET_ACTIVATION(ctx); +#ifndef CONFIG_SMP + /* + * push the other task out, if any + */ + owner_task = GET_PMU_OWNER(); + if (owner_task) pfm_lazy_save_regs(owner_task); +#endif + } + /* + * load all PMD from ctx to PMU (as opposed to thread state) + * restore all PMC from ctx to PMU + */ + pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]); + pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]); + + ctx->ctx_reload_pmcs[0] = 0UL; + ctx->ctx_reload_pmds[0] = 0UL; + + /* + * guaranteed safe by earlier check against DBG_VALID + */ + if (ctx->ctx_fl_using_dbreg) { + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); + } + /* + * set new ownership + */ + SET_PMU_OWNER(task, ctx); + + DPRINT(("context loaded on PMU for [%d]\n", task->pid)); + } else { + /* + * when not current, task MUST be stopped, so this is safe + */ + regs = ia64_task_regs(task); + + /* 
force a full reload */ + ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; + SET_LAST_CPU(ctx, -1); + + /* initial saved psr (stopped) */ + ctx->ctx_saved_psr_up = 0UL; + ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; + } + + ret = 0; + +error_unres: + if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu); +error: + /* + * we must undo the dbregs setting (for system-wide) + */ + if (ret && set_dbregs) { + LOCK_PFS(flags); + pfm_sessions.pfs_sys_use_dbregs--; + UNLOCK_PFS(flags); + } + /* + * release task, there is now a link with the context + */ + if (is_system == 0 && task != current) { + pfm_put_task(task); + + if (ret == 0) { + ret = pfm_check_task_exist(ctx); + if (ret) { + ctx->ctx_state = PFM_CTX_UNLOADED; + ctx->ctx_task = NULL; + } + } + } + return ret; +} + +/* + * in this function, we do not need to increase the use count + * for the task via get_task_struct(), because we hold the + * context lock. If the task were to disappear while having + * a context attached, it would go through pfm_exit_thread() + * which also grabs the context lock and would therefore be blocked + * until we are here. + */ +static void pfm_flush_pmds(struct task_struct *, pfm_context_t *ctx); + +static int +pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) +{ + struct task_struct *task = PFM_CTX_TASK(ctx); + struct pt_regs *tregs; + int prev_state, is_system; + int ret; + + DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1)); + + prev_state = ctx->ctx_state; + is_system = ctx->ctx_fl_system; + + /* + * unload only when necessary + */ + if (prev_state == PFM_CTX_UNLOADED) { + DPRINT(("ctx_state=%d, nothing to do\n", prev_state)); + return 0; + } + + /* + * clear psr and dcr bits + */ + ret = pfm_stop(ctx, NULL, 0, regs); + if (ret) return ret; + + ctx->ctx_state = PFM_CTX_UNLOADED; + + /* + * in system mode, we need to update the PMU directly + * and the user level state of the caller, which may not + * necessarily be the creator of the context. + */ + if (is_system) { + + /* + * Update cpuinfo + * + * local PMU is taken care of in pfm_stop() + */ + PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE); + PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE); + + /* + * save PMDs in context + * release ownership + */ + pfm_flush_pmds(current, ctx); + + /* + * at this point we are done with the PMU + * so we can unreserve the resource. + */ + if (prev_state != PFM_CTX_ZOMBIE) + pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu); + + /* + * disconnect context from task + */ + task->thread.pfm_context = NULL; + /* + * disconnect task from context + */ + ctx->ctx_task = NULL; + + /* + * There is nothing more to cleanup here. + */ + return 0; + } + + /* + * per-task mode + */ + tregs = task == current ? regs : ia64_task_regs(task); + + if (task == current) { + /* + * cancel user level control + */ + ia64_psr(regs)->sp = 1; + + DPRINT(("setting psr.sp for [%d]\n", task->pid)); + } + /* + * save PMDs to context + * release ownership + */ + pfm_flush_pmds(task, ctx); + + /* + * at this point we are done with the PMU + * so we can unreserve the resource. + * + * when state was ZOMBIE, we have already unreserved. 
+ */ + if (prev_state != PFM_CTX_ZOMBIE) + pfm_unreserve_session(ctx, 0, ctx->ctx_cpu); + + /* + * reset activation counter and psr + */ + ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; + SET_LAST_CPU(ctx, -1); + + /* + * PMU state will not be restored + */ + task->thread.flags &= ~IA64_THREAD_PM_VALID; + + /* + * break links between context and task + */ + task->thread.pfm_context = NULL; + ctx->ctx_task = NULL; + + PFM_SET_WORK_PENDING(task, 0); + + ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; + ctx->ctx_fl_can_restart = 0; + ctx->ctx_fl_going_zombie = 0; + + DPRINT(("disconnected [%d] from context\n", task->pid)); + + return 0; +} + + +/* + * called only from exit_thread(): task == current + * we come here only if current has a context attached (loaded or masked) + */ +void +pfm_exit_thread(struct task_struct *task) +{ + pfm_context_t *ctx; + unsigned long flags; + struct pt_regs *regs = ia64_task_regs(task); + int ret, state; + int free_ok = 0; + + ctx = PFM_GET_CTX(task); + + PROTECT_CTX(ctx, flags); + + DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid)); + + state = ctx->ctx_state; + switch(state) { + case PFM_CTX_UNLOADED: + /* + * we only come to this function if pfm_context is not NULL, i.e., the + * context cannot be in the unloaded state + */ + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid); + break; + case PFM_CTX_LOADED: + case PFM_CTX_MASKED: + ret = pfm_context_unload(ctx, NULL, 0, regs); + if (ret) { + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); + } + DPRINT(("ctx unloaded for current, state was %d\n", state)); + + pfm_end_notify_user(ctx); + break; + case PFM_CTX_ZOMBIE: + ret = pfm_context_unload(ctx, NULL, 0, regs); + if (ret) { + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); + } + free_ok = 1; + break; + default: + printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state); + break; + } + UNPROTECT_CTX(ctx, flags); + + { u64 psr = pfm_get_psr(); + BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); + BUG_ON(GET_PMU_OWNER()); + BUG_ON(ia64_psr(regs)->up); + BUG_ON(ia64_psr(regs)->pp); + } + + /* + * All memory free operations (especially for vmalloc'ed memory) + * MUST be done with interrupts ENABLED.
+ */ + if (free_ok) pfm_context_free(ctx); +} + +/* + * functions MUST be listed in the increasing order of their index (see perfmon.h) + */ +#define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz } +#define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL } +#define PFM_CMD_PCLRWS (PFM_CMD_FD|PFM_CMD_ARG_RW|PFM_CMD_STOP) +#define PFM_CMD_PCLRW (PFM_CMD_FD|PFM_CMD_ARG_RW) +#define PFM_CMD_NONE { NULL, "no-cmd", 0, 0, 0, NULL} + +static pfm_cmd_desc_t pfm_cmd_tab[]={ +/* 0 */PFM_CMD_NONE, +/* 1 */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), +/* 2 */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), +/* 3 */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), +/* 4 */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS), +/* 5 */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS), +/* 6 */PFM_CMD_NONE, +/* 7 */PFM_CMD_NONE, +/* 8 */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize), +/* 9 */PFM_CMD_NONE, +/* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW), +/* 11 */PFM_CMD_NONE, +/* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL), +/* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL), +/* 14 */PFM_CMD_NONE, +/* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), +/* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL), +/* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS), +/* 18 */PFM_CMD_NONE, +/* 19 */PFM_CMD_NONE, +/* 20 */PFM_CMD_NONE, +/* 21 */PFM_CMD_NONE, +/* 22 */PFM_CMD_NONE, +/* 23 */PFM_CMD_NONE, +/* 24 */PFM_CMD_NONE, +/* 25 */PFM_CMD_NONE, +/* 26 */PFM_CMD_NONE, +/* 27 */PFM_CMD_NONE, +/* 28 */PFM_CMD_NONE, +/* 29 */PFM_CMD_NONE, +/* 30 */PFM_CMD_NONE, +/* 31 */PFM_CMD_NONE, +/* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL), +/* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL) +}; +#define PFM_CMD_COUNT (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t)) + +static int +pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags) +{ + struct task_struct *task; + int state, old_state; + +recheck: + state = ctx->ctx_state; + task = ctx->ctx_task; + + if (task == NULL) { + DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state)); + return 0; + } + + DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n", + ctx->ctx_fd, + state, + task->pid, + task->state, PFM_CMD_STOPPED(cmd))); + + /* + * self-monitoring always ok. + * + * for system-wide the caller can either be the creator of the + * context (the one to which the context is attached) OR + * a task running on the same CPU as the session. + */ + if (task == current || ctx->ctx_fl_system) return 0; + + /* + * if context is UNLOADED we are safe to go + */ + if (state == PFM_CTX_UNLOADED) return 0; + + /* + * no command can operate on a zombie context + */ + if (state == PFM_CTX_ZOMBIE) { + DPRINT(("cmd %d state zombie cannot operate on context\n", cmd)); + return -EINVAL; + } + + /* + * context is LOADED or MASKED. Some commands may need to have + * the task stopped. + * + * We could lift this restriction for UP but it would mean that + * the user has no guarantee the task would not run between + * two successive calls to perfmonctl(). That's probably OK. + * If the user wants to ensure the task does not run, then + * the task must be stopped.
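+ *
+ * (hedged illustration of the controlling side, using only plain
+ * ptrace(2)/waitpid(2); pcs/npcs stand for a caller-provided
+ * pfarg_reg_t array and its length)
+ *
+ *	ptrace(PTRACE_ATTACH, pid, NULL, NULL);
+ *	waitpid(pid, &status, 0);	(target now stopped)
+ *	perfmonctl(fd, PFM_WRITE_PMCS, pcs, npcs);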
+ */ + if (PFM_CMD_STOPPED(cmd)) { + if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { + DPRINT(("[%d] task not in stopped state\n", task->pid)); + return -EBUSY; + } + /* + * task is now stopped, wait for ctxsw out + * + * This is an interesting point in the code. + * We need to unprotect the context because + * the pfm_save_regs() routines needs to grab + * the same lock. There are danger in doing + * this because it leaves a window open for + * another task to get access to the context + * and possibly change its state. The one thing + * that is not possible is for the context to disappear + * because we are protected by the VFS layer, i.e., + * get_fd()/put_fd(). + */ + old_state = state; + + UNPROTECT_CTX(ctx, flags); + + wait_task_inactive(task); + + PROTECT_CTX(ctx, flags); + + /* + * we must recheck to verify if state has changed + */ + if (ctx->ctx_state != old_state) { + DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state)); + goto recheck; + } + } + return 0; +} + +/* + * system-call entry point (must return long) + */ +asmlinkage long +sys_perfmonctl (int fd, int cmd, void __user *arg, int count) +{ + struct file *file = NULL; + pfm_context_t *ctx = NULL; + unsigned long flags = 0UL; + void *args_k = NULL; + long ret; /* will expand int return types */ + size_t base_sz, sz, xtra_sz = 0; + int narg, completed_args = 0, call_made = 0, cmd_flags; + int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); + int (*getsize)(void *arg, size_t *sz); +#define PFM_MAX_ARGSIZE 4096 + + /* + * reject any call if perfmon was disabled at initialization + */ + if (unlikely(pmu_conf == NULL)) return -ENOSYS; + + if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) { + DPRINT(("invalid cmd=%d\n", cmd)); + return -EINVAL; + } + + func = pfm_cmd_tab[cmd].cmd_func; + narg = pfm_cmd_tab[cmd].cmd_narg; + base_sz = pfm_cmd_tab[cmd].cmd_argsize; + getsize = pfm_cmd_tab[cmd].cmd_getsize; + cmd_flags = pfm_cmd_tab[cmd].cmd_flags; + + if (unlikely(func == NULL)) { + DPRINT(("invalid cmd=%d\n", cmd)); + return -EINVAL; + } + + DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n", + PFM_CMD_NAME(cmd), + cmd, + narg, + base_sz, + count)); + + /* + * check if number of arguments matches what the command expects + */ + if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) || (narg > 0 && narg != count))) + return -EINVAL; + +restart_args: + sz = xtra_sz + base_sz*count; + /* + * limit abuse to min page size + */ + if (unlikely(sz > PFM_MAX_ARGSIZE)) { + printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz); + return -E2BIG; + } + + /* + * allocate default-sized argument buffer + */ + if (likely(count && args_k == NULL)) { + args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL); + if (args_k == NULL) return -ENOMEM; + } + + ret = -EFAULT; + + /* + * copy arguments + * + * assume sz = 0 for command without parameters + */ + if (sz && copy_from_user(args_k, arg, sz)) { + DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg)); + goto error_args; + } + + /* + * check if command supports extra parameters + */ + if (completed_args == 0 && getsize) { + /* + * get extra parameters size (based on main argument) + */ + ret = (*getsize)(args_k, &xtra_sz); + if (ret) goto error_args; + + completed_args = 1; + + DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz)); + + /* retry if necessary */ + if (likely(xtra_sz)) goto restart_args; + } + + if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd; + + ret = -EBADF; + + file = fget(fd); + if 
(unlikely(file == NULL)) { + DPRINT(("invalid fd %d\n", fd)); + goto error_args; + } + if (unlikely(PFM_IS_FILE(file) == 0)) { + DPRINT(("fd %d not related to perfmon\n", fd)); + goto error_args; + } + + ctx = (pfm_context_t *)file->private_data; + if (unlikely(ctx == NULL)) { + DPRINT(("no context for fd %d\n", fd)); + goto error_args; + } + prefetch(&ctx->ctx_state); + + PROTECT_CTX(ctx, flags); + + /* + * check task is stopped + */ + ret = pfm_check_task_state(ctx, cmd, flags); + if (unlikely(ret)) goto abort_locked; + +skip_fd: + ret = (*func)(ctx, args_k, count, ia64_task_regs(current)); + + call_made = 1; + +abort_locked: + if (likely(ctx)) { + DPRINT(("context unlocked\n")); + UNPROTECT_CTX(ctx, flags); + fput(file); + } + + /* copy argument back to user, if needed */ + if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT; + +error_args: + if (args_k) kfree(args_k); + + DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret)); + + return ret; +} + +static void +pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_regs *regs) +{ + pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt; + pfm_ovfl_ctrl_t rst_ctrl; + int state; + int ret = 0; + + state = ctx->ctx_state; + /* + * Unlock sampling buffer and reset index atomically + * XXX: not really needed when blocking + */ + if (CTX_HAS_SMPL(ctx)) { + + rst_ctrl.bits.mask_monitoring = 0; + rst_ctrl.bits.reset_ovfl_pmds = 0; + + if (state == PFM_CTX_LOADED) + ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + else + ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); + } else { + rst_ctrl.bits.mask_monitoring = 0; + rst_ctrl.bits.reset_ovfl_pmds = 1; + } + + if (ret == 0) { + if (rst_ctrl.bits.reset_ovfl_pmds) { + pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET); + } + if (rst_ctrl.bits.mask_monitoring == 0) { + DPRINT(("resuming monitoring\n")); + if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current); + } else { + DPRINT(("stopping monitoring\n")); + //pfm_stop_monitoring(current, regs); + } + ctx->ctx_state = PFM_CTX_LOADED; + } +} + +/* + * context MUST BE LOCKED when calling + * can only be called for current + */ +static void +pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) +{ + int ret; + + DPRINT(("entering for [%d]\n", current->pid)); + + ret = pfm_context_unload(ctx, NULL, 0, regs); + if (ret) { + printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret); + } + + /* + * and wakeup controlling task, indicating we are now disconnected + */ + wake_up_interruptible(&ctx->ctx_zombieq); + + /* + * given that context is still locked, the controlling + * task will only get access when we return from + * pfm_handle_work(). 
+ */
+}
+
+static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds);
+
+void
+pfm_handle_work(void)
+{
+	pfm_context_t *ctx;
+	struct pt_regs *regs;
+	unsigned long flags;
+	unsigned long ovfl_regs;
+	unsigned int reason;
+	int ret;
+
+	ctx = PFM_GET_CTX(current);
+	if (ctx == NULL) {
+		printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid);
+		return;
+	}
+
+	PROTECT_CTX(ctx, flags);
+
+	PFM_SET_WORK_PENDING(current, 0);
+
+	pfm_clear_task_notify();
+
+	regs = ia64_task_regs(current);
+
+	/*
+	 * extract reason for being here and clear
+	 */
+	reason = ctx->ctx_fl_trap_reason;
+	ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE;
+	ovfl_regs = ctx->ctx_ovfl_regs[0];
+
+	DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state));
+
+	/*
+	 * must be done before we check for simple-reset mode
+	 */
+	if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie;
+
+
+	//if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking;
+	if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking;
+
+	UNPROTECT_CTX(ctx, flags);
+
+	/*
+	 * pfm_handle_work() is currently called with interrupts disabled.
+	 * The down_interruptible call may sleep, therefore we
+	 * must re-enable interrupts to avoid deadlocks. It is
+	 * safe to do so because this function is called ONLY
+	 * when returning to user level (pUStk=1), in which case
+	 * there is no risk of kernel stack overflow due to deep
+	 * interrupt nesting.
+	 */
+	BUG_ON(flags & IA64_PSR_I);
+	local_irq_enable();
+
+	DPRINT(("before block sleeping\n"));
+
+	/*
+	 * may go through without blocking on SMP systems
+	 * if restart has been received already by the time we call down()
+	 */
+	ret = down_interruptible(&ctx->ctx_restart_sem);
+
+	DPRINT(("after block sleeping ret=%d\n", ret));
+
+	/*
+	 * disable interrupts to restore state we had upon entering
+	 * this function
+	 */
+	local_irq_disable();
+
+	PROTECT_CTX(ctx, flags);
+
+	/*
+	 * we need to read the ovfl_regs only after wake-up
+	 * because we may have had pfm_write_pmds() in between
+	 * and that can change PMD values and therefore
+	 * ovfl_regs is reset for these new PMD values.
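The user-level half of this handshake, sketched under the same assumptions as the perfmonctl() example above: the controlling process read()s a pfm_msg_t from the context file descriptor and answers with PFM_RESTART, whose handler performs the up() that releases the thread blocked in down_interruptible() here.

#include <unistd.h>
#include <asm/perfmon.h>

/* fd: a perfmon context fd with overflow notification requested,
 * set up as in the earlier sketch */
static int
handle_notifications(int fd)
{
	pfm_msg_t msg;

	for (;;) {
		/* blocks until the kernel queues a message (the fd also
		 * supports poll/select and SIGIO via fasync) */
		if (read(fd, &msg, sizeof(msg)) != sizeof(msg))
			return -1;

		switch (msg.pfm_gen_msg.msg_type) {
		case PFM_MSG_OVFL:
			/* drain/parse the sampling buffer here ... then
			 * release the thread parked in pfm_handle_work() */
			if (perfmonctl(fd, PFM_RESTART, NULL, 0))
				return -1;
			break;
		case PFM_MSG_END:
			return 0;	/* monitored task is gone */
		}
	}
}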
+ */ + ovfl_regs = ctx->ctx_ovfl_regs[0]; + + if (ctx->ctx_fl_going_zombie) { +do_zombie: + DPRINT(("context is zombie, bailing out\n")); + pfm_context_force_terminate(ctx, regs); + goto nothing_to_do; + } + /* + * in case of interruption of down() we don't restart anything + */ + if (ret < 0) goto nothing_to_do; + +skip_blocking: + pfm_resume_after_ovfl(ctx, ovfl_regs, regs); + ctx->ctx_ovfl_regs[0] = 0UL; + +nothing_to_do: + + UNPROTECT_CTX(ctx, flags); +} + +static int +pfm_notify_user(pfm_context_t *ctx, pfm_msg_t *msg) +{ + if (ctx->ctx_state == PFM_CTX_ZOMBIE) { + DPRINT(("ignoring overflow notification, owner is zombie\n")); + return 0; + } + + DPRINT(("waking up somebody\n")); + + if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait); + + /* + * safe, we are not in intr handler, nor in ctxsw when + * we come here + */ + kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN); + + return 0; +} + +static int +pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds) +{ + pfm_msg_t *msg = NULL; + + if (ctx->ctx_fl_no_msg == 0) { + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n"); + return -1; + } + + msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; + msg->pfm_ovfl_msg.msg_ctx_fd = ctx->ctx_fd; + msg->pfm_ovfl_msg.msg_active_set = 0; + msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds; + msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL; + msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL; + msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL; + msg->pfm_ovfl_msg.msg_tstamp = 0UL; + } + + DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n", + msg, + ctx->ctx_fl_no_msg, + ctx->ctx_fd, + ovfl_pmds)); + + return pfm_notify_user(ctx, msg); +} + +static int +pfm_end_notify_user(pfm_context_t *ctx) +{ + pfm_msg_t *msg; + + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n"); + return -1; + } + /* no leak */ + memset(msg, 0, sizeof(*msg)); + + msg->pfm_end_msg.msg_type = PFM_MSG_END; + msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd; + msg->pfm_ovfl_msg.msg_tstamp = 0UL; + + DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n", + msg, + ctx->ctx_fl_no_msg, + ctx->ctx_fd)); + + return pfm_notify_user(ctx, msg); +} + +/* + * main overflow processing routine. + * it can be called from the interrupt path or explicitely during the context switch code + */ +static void +pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs) +{ + pfm_ovfl_arg_t *ovfl_arg; + unsigned long mask; + unsigned long old_val, ovfl_val, new_val; + unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds; + unsigned long tstamp; + pfm_ovfl_ctrl_t ovfl_ctrl; + unsigned int i, has_smpl; + int must_notify = 0; + + if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring; + + /* + * sanity test. Should never happen + */ + if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check; + + tstamp = ia64_get_itc(); + mask = pmc0 >> PMU_FIRST_COUNTER; + ovfl_val = pmu_conf->ovfl_val; + has_smpl = CTX_HAS_SMPL(ctx); + + DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " + "used_pmds=0x%lx\n", + pmc0, + task ? task->pid: -1, + (regs ? regs->cr_iip : 0), + CTX_OVFL_NOBLOCK(ctx) ? 
"nonblocking" : "blocking", + ctx->ctx_used_pmds[0])); + + + /* + * first we update the virtual counters + * assume there was a prior ia64_srlz_d() issued + */ + for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) { + + /* skip pmd which did not overflow */ + if ((mask & 0x1) == 0) continue; + + /* + * Note that the pmd is not necessarily 0 at this point as qualified events + * may have happened before the PMU was frozen. The residual count is not + * taken into consideration here but will be with any read of the pmd via + * pfm_read_pmds(). + */ + old_val = new_val = ctx->ctx_pmds[i].val; + new_val += 1 + ovfl_val; + ctx->ctx_pmds[i].val = new_val; + + /* + * check for overflow condition + */ + if (likely(old_val > new_val)) { + ovfl_pmds |= 1UL << i; + if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify |= 1UL << i; + } + + DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n", + i, + new_val, + old_val, + ia64_get_pmd(i) & ovfl_val, + ovfl_pmds, + ovfl_notify)); + } + + /* + * there was no 64-bit overflow, nothing else to do + */ + if (ovfl_pmds == 0UL) return; + + /* + * reset all control bits + */ + ovfl_ctrl.val = 0; + reset_pmds = 0UL; + + /* + * if a sampling format module exists, then we "cache" the overflow by + * calling the module's handler() routine. + */ + if (has_smpl) { + unsigned long start_cycles, end_cycles; + unsigned long pmd_mask; + int j, k, ret = 0; + int this_cpu = smp_processor_id(); + + pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; + ovfl_arg = &ctx->ctx_ovfl_arg; + + prefetch(ctx->ctx_smpl_hdr); + + for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) { + + mask = 1UL << i; + + if ((pmd_mask & 0x1) == 0) continue; + + ovfl_arg->ovfl_pmd = (unsigned char )i; + ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0; + ovfl_arg->active_set = 0; + ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */ + ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0]; + + ovfl_arg->pmd_value = ctx->ctx_pmds[i].val; + ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval; + ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid; + + /* + * copy values of pmds of interest. Sampling format may copy them + * into sampling buffer. + */ + if (smpl_pmds) { + for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) { + if ((smpl_pmds & 0x1) == 0) continue; + ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); + DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1])); + } + } + + pfm_stats[this_cpu].pfm_smpl_handler_calls++; + + start_cycles = ia64_get_itc(); + + /* + * call custom buffer format record (handler) routine + */ + ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp); + + end_cycles = ia64_get_itc(); + + /* + * For those controls, we take the union because they have + * an all or nothing behavior. 
+ */ + ovfl_ctrl.bits.notify_user |= ovfl_arg->ovfl_ctrl.bits.notify_user; + ovfl_ctrl.bits.block_task |= ovfl_arg->ovfl_ctrl.bits.block_task; + ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring; + /* + * build the bitmask of pmds to reset now + */ + if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask; + + pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles; + } + /* + * when the module cannot handle the rest of the overflows, we abort right here + */ + if (ret && pmd_mask) { + DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n", + pmd_mask<<PMU_FIRST_COUNTER)); + } + /* + * remove the pmds we reset now from the set of pmds to reset in pfm_restart() + */ + ovfl_pmds &= ~reset_pmds; + } else { + /* + * when no sampling module is used, then the default + * is to notify on overflow if requested by user + */ + ovfl_ctrl.bits.notify_user = ovfl_notify ? 1 : 0; + ovfl_ctrl.bits.block_task = ovfl_notify ? 1 : 0; + ovfl_ctrl.bits.mask_monitoring = ovfl_notify ? 1 : 0; /* XXX: change for saturation */ + ovfl_ctrl.bits.reset_ovfl_pmds = ovfl_notify ? 0 : 1; + /* + * if needed, we reset all overflowed pmds + */ + if (ovfl_notify == 0) reset_pmds = ovfl_pmds; + } + + DPRINT_ovfl(("ovfl_pmds=0x%lx reset_pmds=0x%lx\n", ovfl_pmds, reset_pmds)); + + /* + * reset the requested PMD registers using the short reset values + */ + if (reset_pmds) { + unsigned long bm = reset_pmds; + pfm_reset_regs(ctx, &bm, PFM_PMD_SHORT_RESET); + } + + if (ovfl_notify && ovfl_ctrl.bits.notify_user) { + /* + * keep track of what to reset when unblocking + */ + ctx->ctx_ovfl_regs[0] = ovfl_pmds; + + /* + * check for blocking context + */ + if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) { + + ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK; + + /* + * set the perfmon specific checking pending work for the task + */ + PFM_SET_WORK_PENDING(task, 1); + + /* + * when coming from ctxsw, current still points to the + * previous task, therefore we must work with task and not current. + */ + pfm_set_task_notify(task); + } + /* + * defer until state is changed (shorten spin window). the context is locked + * anyway, so the signal receiver would come spin for nothing. + */ + must_notify = 1; + } + + DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n", + GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1, + PFM_GET_WORK_PENDING(task), + ctx->ctx_fl_trap_reason, + ovfl_pmds, + ovfl_notify, + ovfl_ctrl.bits.mask_monitoring ? 1 : 0)); + /* + * in case monitoring must be stopped, we toggle the psr bits + */ + if (ovfl_ctrl.bits.mask_monitoring) { + pfm_mask_monitoring(task); + ctx->ctx_state = PFM_CTX_MASKED; + ctx->ctx_fl_can_restart = 1; + } + + /* + * send notification now + */ + if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify); + + return; + +sanity_check: + printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n", + smp_processor_id(), + task ? task->pid : -1, + pmc0); + return; + +stop_monitoring: + /* + * in SMP, zombie context is never restored but reclaimed in pfm_load_regs(). + * Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can + * come here as zombie only if the task is the current task. In which case, we + * can access the PMU hardware directly. + * + * Note that zombies do have PM_VALID set. So here we do the minimal. + * + * In case the context was zombified it could not be reclaimed at the time + * the monitoring program exited. 
At this point, the PMU reservation has been
+	 * returned, the sampling buffer has been freed. We must convert this call
+	 * into a spurious interrupt. However, we must also avoid infinite overflows
+	 * by stopping monitoring for this task. We can only come here for a per-task
+	 * context. All we need to do is to stop monitoring using the psr bits which
+	 * are always task private. By re-enabling secure monitoring, we ensure that
+	 * the monitored task will not be able to re-activate monitoring.
+	 * The task will eventually be context switched out, at which point the context
+	 * will be reclaimed (that includes releasing ownership of the PMU).
+	 *
+	 * So there might be a window of time where the number of per-task sessions is zero
+	 * yet one PMU might have an owner and get at most one overflow interrupt for a zombie
+	 * context. This is safe because if a per-task session comes in, it will push this one
+	 * out and by virtue of pfm_save_regs(), this one will disappear. If a system wide
+	 * session is forced on that CPU, given that we use task pinning, pfm_save_regs() will
+	 * also push our zombie context out.
+	 *
+	 * Overall pretty hairy stuff....
+	 */
+	DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1));
+	pfm_clear_psr_up();
+	ia64_psr(regs)->up = 0;
+	ia64_psr(regs)->sp = 1;
+	return;
+}
+
+static int
+pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
+{
+	struct task_struct *task;
+	pfm_context_t *ctx;
+	unsigned long flags;
+	u64 pmc0;
+	int this_cpu = smp_processor_id();
+	int retval = 0;
+
+	pfm_stats[this_cpu].pfm_ovfl_intr_count++;
+
+	/*
+	 * srlz.d done before arriving here
+	 */
+	pmc0 = ia64_get_pmc(0);
+
+	task = GET_PMU_OWNER();
+	ctx  = GET_PMU_CTX();
+
+	/*
+	 * if we have some pending bits set
+	 * assumes : if any PMC0.bit[63-1] is set, then PMC0.fr = 1
+	 */
+	if (PMC0_HAS_OVFL(pmc0) && task) {
+		/*
+		 * we assume that pmc0.fr is always set here
+		 */
+
+		/* sanity check */
+		if (!ctx) goto report_spurious1;
+
+		if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0)
+			goto report_spurious2;
+
+		PROTECT_CTX_NOPRINT(ctx, flags);
+
+		pfm_overflow_handler(task, ctx, pmc0, regs);
+
+		UNPROTECT_CTX_NOPRINT(ctx, flags);
+
+	} else {
+		pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++;
+		retval = -1;
+	}
+	/*
+	 * keep it unfrozen at all times
+	 */
+	pfm_unfreeze_pmu();
+
+	return retval;
+
+report_spurious1:
+	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n",
+		this_cpu, task->pid);
+	pfm_unfreeze_pmu();
+	return -1;
+report_spurious2:
+	printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n",
+		this_cpu,
+		task->pid);
+	pfm_unfreeze_pmu();
+	return -1;
+}
+
+static irqreturn_t
+pfm_interrupt_handler(int irq, void *arg, struct pt_regs *regs)
+{
+	unsigned long start_cycles, total_cycles;
+	unsigned long min, max;
+	int this_cpu;
+	int ret;
+
+	this_cpu = get_cpu();
+	min      = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min;
+	max      = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max;
+
+	start_cycles = ia64_get_itc();
+
+	ret = pfm_do_interrupt_handler(irq, arg, regs);
+
+	total_cycles = ia64_get_itc();
+
+	/*
+	 * don't measure spurious interrupts
+	 */
+	if (likely(ret == 0)) {
+		total_cycles -= start_cycles;
+
+		if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles;
+		if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles;
+
+		pfm_stats[this_cpu].pfm_ovfl_intr_cycles +=
total_cycles; + } + put_cpu_no_resched(); + return IRQ_HANDLED; +} + +/* + * /proc/perfmon interface, for debug only + */ + +#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) + +static void * +pfm_proc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) { + return PFM_PROC_SHOW_HEADER; + } + + while (*pos <= NR_CPUS) { + if (cpu_online(*pos - 1)) { + return (void *)*pos; + } + ++*pos; + } + return NULL; +} + +static void * +pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return pfm_proc_start(m, pos); +} + +static void +pfm_proc_stop(struct seq_file *m, void *v) +{ +} + +static void +pfm_proc_show_header(struct seq_file *m) +{ + struct list_head * pos; + pfm_buffer_fmt_t * entry; + unsigned long flags; + + seq_printf(m, + "perfmon version : %u.%u\n" + "model : %s\n" + "fastctxsw : %s\n" + "expert mode : %s\n" + "ovfl_mask : 0x%lx\n" + "PMU flags : 0x%x\n", + PFM_VERSION_MAJ, PFM_VERSION_MIN, + pmu_conf->pmu_name, + pfm_sysctl.fastctxsw > 0 ? "Yes": "No", + pfm_sysctl.expert_mode > 0 ? "Yes": "No", + pmu_conf->ovfl_val, + pmu_conf->flags); + + LOCK_PFS(flags); + + seq_printf(m, + "proc_sessions : %u\n" + "sys_sessions : %u\n" + "sys_use_dbregs : %u\n" + "ptrace_use_dbregs : %u\n", + pfm_sessions.pfs_task_sessions, + pfm_sessions.pfs_sys_sessions, + pfm_sessions.pfs_sys_use_dbregs, + pfm_sessions.pfs_ptrace_use_dbregs); + + UNLOCK_PFS(flags); + + spin_lock(&pfm_buffer_fmt_lock); + + list_for_each(pos, &pfm_buffer_fmt_list) { + entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); + seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", + entry->fmt_uuid[0], + entry->fmt_uuid[1], + entry->fmt_uuid[2], + entry->fmt_uuid[3], + entry->fmt_uuid[4], + entry->fmt_uuid[5], + entry->fmt_uuid[6], + entry->fmt_uuid[7], + entry->fmt_uuid[8], + entry->fmt_uuid[9], + entry->fmt_uuid[10], + entry->fmt_uuid[11], + entry->fmt_uuid[12], + entry->fmt_uuid[13], + entry->fmt_uuid[14], + entry->fmt_uuid[15], + entry->fmt_name); + } + spin_unlock(&pfm_buffer_fmt_lock); + +} + +static int +pfm_proc_show(struct seq_file *m, void *v) +{ + unsigned long psr; + unsigned int i; + int cpu; + + if (v == PFM_PROC_SHOW_HEADER) { + pfm_proc_show_header(m); + return 0; + } + + /* show info for CPU (v - 1) */ + + cpu = (long)v - 1; + seq_printf(m, + "CPU%-2d overflow intrs : %lu\n" + "CPU%-2d overflow cycles : %lu\n" + "CPU%-2d overflow min : %lu\n" + "CPU%-2d overflow max : %lu\n" + "CPU%-2d smpl handler calls : %lu\n" + "CPU%-2d smpl handler cycles : %lu\n" + "CPU%-2d spurious intrs : %lu\n" + "CPU%-2d replay intrs : %lu\n" + "CPU%-2d syst_wide : %d\n" + "CPU%-2d dcr_pp : %d\n" + "CPU%-2d exclude idle : %d\n" + "CPU%-2d owner : %d\n" + "CPU%-2d context : %p\n" + "CPU%-2d activations : %lu\n", + cpu, pfm_stats[cpu].pfm_ovfl_intr_count, + cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles, + cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min, + cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max, + cpu, pfm_stats[cpu].pfm_smpl_handler_calls, + cpu, pfm_stats[cpu].pfm_smpl_handler_cycles, + cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count, + cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count, + cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0, + cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0, + cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0, + cpu, pfm_get_cpu_data(pmu_owner, cpu) ? 
pfm_get_cpu_data(pmu_owner, cpu)->pid: -1, + cpu, pfm_get_cpu_data(pmu_ctx, cpu), + cpu, pfm_get_cpu_data(pmu_activation_number, cpu)); + + if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) { + + psr = pfm_get_psr(); + + ia64_srlz_d(); + + seq_printf(m, + "CPU%-2d psr : 0x%lx\n" + "CPU%-2d pmc0 : 0x%lx\n", + cpu, psr, + cpu, ia64_get_pmc(0)); + + for (i=0; PMC_IS_LAST(i) == 0; i++) { + if (PMC_IS_COUNTING(i) == 0) continue; + seq_printf(m, + "CPU%-2d pmc%u : 0x%lx\n" + "CPU%-2d pmd%u : 0x%lx\n", + cpu, i, ia64_get_pmc(i), + cpu, i, ia64_get_pmd(i)); + } + } + return 0; +} + +struct seq_operations pfm_seq_ops = { + .start = pfm_proc_start, + .next = pfm_proc_next, + .stop = pfm_proc_stop, + .show = pfm_proc_show +}; + +static int +pfm_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &pfm_seq_ops); +} + + +/* + * we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens + * during pfm_enable() hence before pfm_start(). We cannot assume monitoring + * is active or inactive based on mode. We must rely on the value in + * local_cpu_data->pfm_syst_info + */ +void +pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) +{ + struct pt_regs *regs; + unsigned long dcr; + unsigned long dcr_pp; + + dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0; + + /* + * pid 0 is guaranteed to be the idle task. There is one such task with pid 0 + * on every CPU, so we can rely on the pid to identify the idle task. + */ + if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) { + regs = ia64_task_regs(task); + ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0; + return; + } + /* + * if monitoring has started + */ + if (dcr_pp) { + dcr = ia64_getreg(_IA64_REG_CR_DCR); + /* + * context switching in? + */ + if (is_ctxswin) { + /* mask monitoring for the idle task */ + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + pfm_clear_psr_pp(); + ia64_srlz_i(); + return; + } + /* + * context switching out + * restore monitoring for next task + * + * Due to inlining this odd if-then-else construction generates + * better code. + */ + ia64_setreg(_IA64_REG_CR_DCR, dcr |IA64_DCR_PP); + pfm_set_psr_pp(); + ia64_srlz_i(); + } +} + +#ifdef CONFIG_SMP + +static void +pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) +{ + struct task_struct *task = ctx->ctx_task; + + ia64_psr(regs)->up = 0; + ia64_psr(regs)->sp = 1; + + if (GET_PMU_OWNER() == task) { + DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); + SET_PMU_OWNER(NULL, NULL); + } + + /* + * disconnect the task from the context and vice-versa + */ + PFM_SET_WORK_PENDING(task, 0); + + task->thread.pfm_context = NULL; + task->thread.flags &= ~IA64_THREAD_PM_VALID; + + DPRINT(("force cleanup for [%d]\n", task->pid)); +} + + +/* + * in 2.6, interrupts are masked when we come here and the runqueue lock is held + */ +void +pfm_save_regs(struct task_struct *task) +{ + pfm_context_t *ctx; + struct thread_struct *t; + unsigned long flags; + u64 psr; + + + ctx = PFM_GET_CTX(task); + if (ctx == NULL) return; + t = &task->thread; + + /* + * we always come here with interrupts ALREADY disabled by + * the scheduler. So we simply need to protect against concurrent + * access, not CPU concurrency. 
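Stepping back to the /proc/perfmon code above: the start/next/stop/show contract it follows is the standard seq_file pattern, with 1-based iterator tokens because returning NULL (i.e. 0) would signal end-of-sequence. A stripped-down sketch of the same shape, using only 2.6-era seq_file and procfs calls; everything here is illustrative, not part of the patch:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#define EX_ITEMS	4	/* arbitrary fixed-size sequence */

static void *ex_start(struct seq_file *m, loff_t *pos)
{
	/* hand back a 1-based token, as pfm_proc_start() does for CPUs */
	return *pos < EX_ITEMS ? (void *)(unsigned long)(*pos + 1) : NULL;
}

static void *ex_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return ex_start(m, pos);
}

static void ex_stop(struct seq_file *m, void *v)
{
}

static int ex_show(struct seq_file *m, void *v)
{
	seq_printf(m, "item %ld\n", (long)v - 1);	/* undo the +1 offset */
	return 0;
}

static struct seq_operations ex_seq_ops = {
	.start = ex_start,
	.next  = ex_next,
	.stop  = ex_stop,
	.show  = ex_show,
};

static int ex_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &ex_seq_ops);
}

static struct file_operations ex_fops = {
	.open    = ex_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init ex_init(void)
{
	struct proc_dir_entry *p = create_proc_entry("seq_example", S_IRUGO, NULL);
	if (p == NULL)
		return -ENOMEM;
	p->proc_fops = &ex_fops;
	return 0;
}
module_init(ex_init);
MODULE_LICENSE("GPL");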
+ */ + flags = pfm_protect_ctx_ctxsw(ctx); + + if (ctx->ctx_state == PFM_CTX_ZOMBIE) { + struct pt_regs *regs = ia64_task_regs(task); + + pfm_clear_psr_up(); + + pfm_force_cleanup(ctx, regs); + + BUG_ON(ctx->ctx_smpl_hdr); + + pfm_unprotect_ctx_ctxsw(ctx, flags); + + pfm_context_free(ctx); + return; + } + + /* + * save current PSR: needed because we modify it + */ + ia64_srlz_d(); + psr = pfm_get_psr(); + + BUG_ON(psr & (IA64_PSR_I)); + + /* + * stop monitoring: + * This is the last instruction which may generate an overflow + * + * We do not need to set psr.sp because, it is irrelevant in kernel. + * It will be restored from ipsr when going back to user level + */ + pfm_clear_psr_up(); + + /* + * keep a copy of psr.up (for reload) + */ + ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; + + /* + * release ownership of this PMU. + * PM interrupts are masked, so nothing + * can happen. + */ + SET_PMU_OWNER(NULL, NULL); + + /* + * we systematically save the PMD as we have no + * guarantee we will be schedule at that same + * CPU again. + */ + pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]); + + /* + * save pmc0 ia64_srlz_d() done in pfm_save_pmds() + * we will need it on the restore path to check + * for pending overflow. + */ + t->pmcs[0] = ia64_get_pmc(0); + + /* + * unfreeze PMU if had pending overflows + */ + if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); + + /* + * finally, allow context access. + * interrupts will still be masked after this call. + */ + pfm_unprotect_ctx_ctxsw(ctx, flags); +} + +#else /* !CONFIG_SMP */ +void +pfm_save_regs(struct task_struct *task) +{ + pfm_context_t *ctx; + u64 psr; + + ctx = PFM_GET_CTX(task); + if (ctx == NULL) return; + + /* + * save current PSR: needed because we modify it + */ + psr = pfm_get_psr(); + + BUG_ON(psr & (IA64_PSR_I)); + + /* + * stop monitoring: + * This is the last instruction which may generate an overflow + * + * We do not need to set psr.sp because, it is irrelevant in kernel. + * It will be restored from ipsr when going back to user level + */ + pfm_clear_psr_up(); + + /* + * keep a copy of psr.up (for reload) + */ + ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; +} + +static void +pfm_lazy_save_regs (struct task_struct *task) +{ + pfm_context_t *ctx; + struct thread_struct *t; + unsigned long flags; + + { u64 psr = pfm_get_psr(); + BUG_ON(psr & IA64_PSR_UP); + } + + ctx = PFM_GET_CTX(task); + t = &task->thread; + + /* + * we need to mask PMU overflow here to + * make sure that we maintain pmc0 until + * we save it. overflow interrupts are + * treated as spurious if there is no + * owner. + * + * XXX: I don't think this is necessary + */ + PROTECT_CTX(ctx,flags); + + /* + * release ownership of this PMU. + * must be done before we save the registers. + * + * after this call any PMU interrupt is treated + * as spurious. 
+ */ + SET_PMU_OWNER(NULL, NULL); + + /* + * save all the pmds we use + */ + pfm_save_pmds(t->pmds, ctx->ctx_used_pmds[0]); + + /* + * save pmc0 ia64_srlz_d() done in pfm_save_pmds() + * it is needed to check for pended overflow + * on the restore path + */ + t->pmcs[0] = ia64_get_pmc(0); + + /* + * unfreeze PMU if had pending overflows + */ + if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); + + /* + * now get can unmask PMU interrupts, they will + * be treated as purely spurious and we will not + * lose any information + */ + UNPROTECT_CTX(ctx,flags); +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +/* + * in 2.6, interrupts are masked when we come here and the runqueue lock is held + */ +void +pfm_load_regs (struct task_struct *task) +{ + pfm_context_t *ctx; + struct thread_struct *t; + unsigned long pmc_mask = 0UL, pmd_mask = 0UL; + unsigned long flags; + u64 psr, psr_up; + int need_irq_resend; + + ctx = PFM_GET_CTX(task); + if (unlikely(ctx == NULL)) return; + + BUG_ON(GET_PMU_OWNER()); + + t = &task->thread; + /* + * possible on unload + */ + if (unlikely((t->flags & IA64_THREAD_PM_VALID) == 0)) return; + + /* + * we always come here with interrupts ALREADY disabled by + * the scheduler. So we simply need to protect against concurrent + * access, not CPU concurrency. + */ + flags = pfm_protect_ctx_ctxsw(ctx); + psr = pfm_get_psr(); + + need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; + + BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); + BUG_ON(psr & IA64_PSR_I); + + if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) { + struct pt_regs *regs = ia64_task_regs(task); + + BUG_ON(ctx->ctx_smpl_hdr); + + pfm_force_cleanup(ctx, regs); + + pfm_unprotect_ctx_ctxsw(ctx, flags); + + /* + * this one (kmalloc'ed) is fine with interrupts disabled + */ + pfm_context_free(ctx); + + return; + } + + /* + * we restore ALL the debug registers to avoid picking up + * stale state. + */ + if (ctx->ctx_fl_using_dbreg) { + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); + } + /* + * retrieve saved psr.up + */ + psr_up = ctx->ctx_saved_psr_up; + + /* + * if we were the last user of the PMU on that CPU, + * then nothing to do except restore psr + */ + if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) { + + /* + * retrieve partial reload masks (due to user modifications) + */ + pmc_mask = ctx->ctx_reload_pmcs[0]; + pmd_mask = ctx->ctx_reload_pmds[0]; + + } else { + /* + * To avoid leaking information to the user level when psr.sp=0, + * we must reload ALL implemented pmds (even the ones we don't use). + * In the kernel we only allow PFM_READ_PMDS on registers which + * we initialized or requested (sampling) so there is no risk there. + */ + pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; + + /* + * ALL accessible PMCs are systematically reloaded, unused registers + * get their default (from pfm_reset_pmu_state()) values to avoid picking + * up stale configuration. + * + * PMC0 is never in the mask. It is always restored separately. + */ + pmc_mask = ctx->ctx_all_pmcs[0]; + } + /* + * when context is MASKED, we will restore PMC with plm=0 + * and PMD with stale information, but that's ok, nothing + * will be captured. + * + * XXX: optimize here + */ + if (pmd_mask) pfm_restore_pmds(t->pmds, pmd_mask); + if (pmc_mask) pfm_restore_pmcs(t->pmcs, pmc_mask); + + /* + * check for pending overflow at the time the state + * was saved. 
+ */ + if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { + /* + * reload pmc0 with the overflow information + * On McKinley PMU, this will trigger a PMU interrupt + */ + ia64_set_pmc(0, t->pmcs[0]); + ia64_srlz_d(); + t->pmcs[0] = 0UL; + + /* + * will replay the PMU interrupt + */ + if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); + + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; + } + + /* + * we just did a reload, so we reset the partial reload fields + */ + ctx->ctx_reload_pmcs[0] = 0UL; + ctx->ctx_reload_pmds[0] = 0UL; + + SET_LAST_CPU(ctx, smp_processor_id()); + + /* + * dump activation value for this PMU + */ + INC_ACTIVATION(); + /* + * record current activation for this context + */ + SET_ACTIVATION(ctx); + + /* + * establish new ownership. + */ + SET_PMU_OWNER(task, ctx); + + /* + * restore the psr.up bit. measurement + * is active again. + * no PMU interrupt can happen at this point + * because we still have interrupts disabled. + */ + if (likely(psr_up)) pfm_set_psr_up(); + + /* + * allow concurrent access to context + */ + pfm_unprotect_ctx_ctxsw(ctx, flags); +} +#else /* !CONFIG_SMP */ +/* + * reload PMU state for UP kernels + * in 2.5 we come here with interrupts disabled + */ +void +pfm_load_regs (struct task_struct *task) +{ + struct thread_struct *t; + pfm_context_t *ctx; + struct task_struct *owner; + unsigned long pmd_mask, pmc_mask; + u64 psr, psr_up; + int need_irq_resend; + + owner = GET_PMU_OWNER(); + ctx = PFM_GET_CTX(task); + t = &task->thread; + psr = pfm_get_psr(); + + BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); + BUG_ON(psr & IA64_PSR_I); + + /* + * we restore ALL the debug registers to avoid picking up + * stale state. + * + * This must be done even when the task is still the owner + * as the registers may have been modified via ptrace() + * (not perfmon) by the previous task. + */ + if (ctx->ctx_fl_using_dbreg) { + pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); + pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); + } + + /* + * retrieved saved psr.up + */ + psr_up = ctx->ctx_saved_psr_up; + need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; + + /* + * short path, our state is still there, just + * need to restore psr and we go + * + * we do not touch either PMC nor PMD. the psr is not touched + * by the overflow_handler. So we are safe w.r.t. to interrupt + * concurrency even without interrupt masking. + */ + if (likely(owner == task)) { + if (likely(psr_up)) pfm_set_psr_up(); + return; + } + + /* + * someone else is still using the PMU, first push it out and + * then we'll be able to install our stuff ! + * + * Upon return, there will be no owner for the current PMU + */ + if (owner) pfm_lazy_save_regs(owner); + + /* + * To avoid leaking information to the user level when psr.sp=0, + * we must reload ALL implemented pmds (even the ones we don't use). + * In the kernel we only allow PFM_READ_PMDS on registers which + * we initialized or requested (sampling) so there is no risk there. + */ + pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; + + /* + * ALL accessible PMCs are systematically reloaded, unused registers + * get their default (from pfm_reset_pmu_state()) values to avoid picking + * up stale configuration. + * + * PMC0 is never in the mask. It is always restored separately + */ + pmc_mask = ctx->ctx_all_pmcs[0]; + + pfm_restore_pmds(t->pmds, pmd_mask); + pfm_restore_pmcs(t->pmcs, pmc_mask); + + /* + * check for pending overflow at the time the state + * was saved. 
+ */ + if (unlikely(PMC0_HAS_OVFL(t->pmcs[0]))) { + /* + * reload pmc0 with the overflow information + * On McKinley PMU, this will trigger a PMU interrupt + */ + ia64_set_pmc(0, t->pmcs[0]); + ia64_srlz_d(); + + t->pmcs[0] = 0UL; + + /* + * will replay the PMU interrupt + */ + if (need_irq_resend) hw_resend_irq(NULL, IA64_PERFMON_VECTOR); + + pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; + } + + /* + * establish new ownership. + */ + SET_PMU_OWNER(task, ctx); + + /* + * restore the psr.up bit. measurement + * is active again. + * no PMU interrupt can happen at this point + * because we still have interrupts disabled. + */ + if (likely(psr_up)) pfm_set_psr_up(); +} +#endif /* CONFIG_SMP */ + +/* + * this function assumes monitoring is stopped + */ +static void +pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) +{ + u64 pmc0; + unsigned long mask2, val, pmd_val, ovfl_val; + int i, can_access_pmu = 0; + int is_self; + + /* + * is the caller the task being monitored (or which initiated the + * session for system wide measurements) + */ + is_self = ctx->ctx_task == task ? 1 : 0; + + /* + * can access PMU is task is the owner of the PMU state on the current CPU + * or if we are running on the CPU bound to the context in system-wide mode + * (that is not necessarily the task the context is attached to in this mode). + * In system-wide we always have can_access_pmu true because a task running on an + * invalid processor is flagged earlier in the call stack (see pfm_stop). + */ + can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id()); + if (can_access_pmu) { + /* + * Mark the PMU as not owned + * This will cause the interrupt handler to do nothing in case an overflow + * interrupt was in-flight + * This also guarantees that pmc0 will contain the final state + * It virtually gives us full control on overflow processing from that point + * on. + */ + SET_PMU_OWNER(NULL, NULL); + DPRINT(("releasing ownership\n")); + + /* + * read current overflow status: + * + * we are guaranteed to read the final stable state + */ + ia64_srlz_d(); + pmc0 = ia64_get_pmc(0); /* slow */ + + /* + * reset freeze bit, overflow status information destroyed + */ + pfm_unfreeze_pmu(); + } else { + pmc0 = task->thread.pmcs[0]; + /* + * clear whatever overflow status bits there were + */ + task->thread.pmcs[0] = 0; + } + ovfl_val = pmu_conf->ovfl_val; + /* + * we save all the used pmds + * we take care of overflows for counting PMDs + * + * XXX: sampling situation is not taken into account here + */ + mask2 = ctx->ctx_used_pmds[0]; + + DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2)); + + for (i = 0; mask2; i++, mask2>>=1) { + + /* skip non used pmds */ + if ((mask2 & 0x1) == 0) continue; + + /* + * can access PMU always true in system wide mode + */ + val = pmd_val = can_access_pmu ? 
ia64_get_pmd(i) : task->thread.pmds[i]; + + if (PMD_IS_COUNTING(i)) { + DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n", + task->pid, + i, + ctx->ctx_pmds[i].val, + val & ovfl_val)); + + /* + * we rebuild the full 64 bit value of the counter + */ + val = ctx->ctx_pmds[i].val + (val & ovfl_val); + + /* + * now everything is in ctx_pmds[] and we need + * to clear the saved context from save_regs() such that + * pfm_read_pmds() gets the correct value + */ + pmd_val = 0UL; + + /* + * take care of overflow inline + */ + if (pmc0 & (1UL << i)) { + val += 1 + ovfl_val; + DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); + } + } + + DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val)); + + if (is_self) task->thread.pmds[i] = pmd_val; + + ctx->ctx_pmds[i].val = val; + } +} + +static struct irqaction perfmon_irqaction = { + .handler = pfm_interrupt_handler, + .flags = SA_INTERRUPT, + .name = "perfmon" +}; + +/* + * perfmon initialization routine, called from the initcall() table + */ +static int init_pfm_fs(void); + +static int __init +pfm_probe_pmu(void) +{ + pmu_config_t **p; + int family; + + family = local_cpu_data->family; + p = pmu_confs; + + while(*p) { + if ((*p)->probe) { + if ((*p)->probe() == 0) goto found; + } else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) { + goto found; + } + p++; + } + return -1; +found: + pmu_conf = *p; + return 0; +} + +static struct file_operations pfm_proc_fops = { + .open = pfm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int __init +pfm_init(void) +{ + unsigned int n, n_counters, i; + + printk("perfmon: version %u.%u IRQ %u\n", + PFM_VERSION_MAJ, + PFM_VERSION_MIN, + IA64_PERFMON_VECTOR); + + if (pfm_probe_pmu()) { + printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n", + local_cpu_data->family); + return -ENODEV; + } + + /* + * compute the number of implemented PMD/PMC from the + * description tables + */ + n = 0; + for (i=0; PMC_IS_LAST(i) == 0; i++) { + if (PMC_IS_IMPL(i) == 0) continue; + pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63); + n++; + } + pmu_conf->num_pmcs = n; + + n = 0; n_counters = 0; + for (i=0; PMD_IS_LAST(i) == 0; i++) { + if (PMD_IS_IMPL(i) == 0) continue; + pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63); + n++; + if (PMD_IS_COUNTING(i)) n_counters++; + } + pmu_conf->num_pmds = n; + pmu_conf->num_counters = n_counters; + + /* + * sanity checks on the number of debug registers + */ + if (pmu_conf->use_rr_dbregs) { + if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) { + printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs); + pmu_conf = NULL; + return -1; + } + if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) { + printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_ibrs); + pmu_conf = NULL; + return -1; + } + } + + printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n", + pmu_conf->pmu_name, + pmu_conf->num_pmcs, + pmu_conf->num_pmds, + pmu_conf->num_counters, + ffz(pmu_conf->ovfl_val)); + + /* sanity check */ + if (pmu_conf->num_pmds >= IA64_NUM_PMD_REGS || pmu_conf->num_pmcs >= IA64_NUM_PMC_REGS) { + printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n"); + pmu_conf = NULL; + return -1; + } + + /* + * create /proc/perfmon (mostly for debugging purposes) + */ + perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL); + if (perfmon_dir == NULL) { + printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon 
disabled\n"); + pmu_conf = NULL; + return -1; + } + /* + * install customized file operations for /proc/perfmon entry + */ + perfmon_dir->proc_fops = &pfm_proc_fops; + + /* + * create /proc/sys/kernel/perfmon (for debugging purposes) + */ + pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0); + + /* + * initialize all our spinlocks + */ + spin_lock_init(&pfm_sessions.pfs_lock); + spin_lock_init(&pfm_buffer_fmt_lock); + + init_pfm_fs(); + + for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL; + + return 0; +} + +__initcall(pfm_init); + +/* + * this function is called before pfm_init() + */ +void +pfm_init_percpu (void) +{ + /* + * make sure no measurement is active + * (may inherit programmed PMCs from EFI). + */ + pfm_clear_psr_pp(); + pfm_clear_psr_up(); + + /* + * we run with the PMU not frozen at all times + */ + pfm_unfreeze_pmu(); + + if (smp_processor_id() == 0) + register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + + ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); + ia64_srlz_d(); +} + +/* + * used for debug purposes only + */ +void +dump_pmu_state(const char *from) +{ + struct task_struct *task; + struct thread_struct *t; + struct pt_regs *regs; + pfm_context_t *ctx; + unsigned long psr, dcr, info, flags; + int i, this_cpu; + + local_irq_save(flags); + + this_cpu = smp_processor_id(); + regs = ia64_task_regs(current); + info = PFM_CPUINFO_GET(); + dcr = ia64_getreg(_IA64_REG_CR_DCR); + + if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) { + local_irq_restore(flags); + return; + } + + printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", + this_cpu, + from, + current->pid, + regs->cr_iip, + current->comm); + + task = GET_PMU_OWNER(); + ctx = GET_PMU_CTX(); + + printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx); + + psr = pfm_get_psr(); + + printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n", + this_cpu, + ia64_get_pmc(0), + psr & IA64_PSR_PP ? 1 : 0, + psr & IA64_PSR_UP ? 1 : 0, + dcr & IA64_DCR_PP ? 1 : 0, + info, + ia64_psr(regs)->up, + ia64_psr(regs)->pp); + + ia64_psr(regs)->up = 0; + ia64_psr(regs)->pp = 0; + + t = ¤t->thread; + + for (i=1; PMC_IS_LAST(i) == 0; i++) { + if (PMC_IS_IMPL(i) == 0) continue; + printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]); + } + + for (i=1; PMD_IS_LAST(i) == 0; i++) { + if (PMD_IS_IMPL(i) == 0) continue; + printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]); + } + + if (ctx) { + printk("->CPU%d ctx_state=%d vaddr=%p addr=%p fd=%d ctx_task=[%d] saved_psr_up=0x%lx\n", + this_cpu, + ctx->ctx_state, + ctx->ctx_smpl_vaddr, + ctx->ctx_smpl_hdr, + ctx->ctx_msgq_head, + ctx->ctx_msgq_tail, + ctx->ctx_saved_psr_up); + } + local_irq_restore(flags); +} + +/* + * called from process.c:copy_thread(). task is new child. 
+ */ +void +pfm_inherit(struct task_struct *task, struct pt_regs *regs) +{ + struct thread_struct *thread; + + DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid)); + + thread = &task->thread; + + /* + * cut links inherited from parent (current) + */ + thread->pfm_context = NULL; + + PFM_SET_WORK_PENDING(task, 0); + + /* + * the psr bits are already set properly in copy_threads() + */ +} +#else /* !CONFIG_PERFMON */ +asmlinkage long +sys_perfmonctl (int fd, int cmd, void *arg, int count) +{ + return -ENOSYS; +} +#endif /* CONFIG_PERFMON */ diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c new file mode 100644 index 000000000000..965d29004555 --- /dev/null +++ b/arch/ia64/kernel/perfmon_default_smpl.c @@ -0,0 +1,306 @@ +/* + * Copyright (C) 2002-2003 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * + * This file implements the default sampling buffer format + * for the Linux/ia64 perfmon-2 subsystem. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/config.h> +#include <linux/init.h> +#include <asm/delay.h> +#include <linux/smp.h> + +#include <asm/perfmon.h> +#include <asm/perfmon_default_smpl.h> + +MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); +MODULE_DESCRIPTION("perfmon default sampling format"); +MODULE_LICENSE("GPL"); + +MODULE_PARM(debug, "i"); +MODULE_PARM_DESC(debug, "debug"); + +MODULE_PARM(debug_ovfl, "i"); +MODULE_PARM_DESC(debug_ovfl, "debug ovfl"); + + +#define DEFAULT_DEBUG 1 + +#ifdef DEFAULT_DEBUG +#define DPRINT(a) \ + do { \ + if (unlikely(debug >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ + } while (0) + +#define DPRINT_ovfl(a) \ + do { \ + if (unlikely(debug_ovfl >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ + } while (0) + +#else +#define DPRINT(a) +#define DPRINT_ovfl(a) +#endif + +static int debug, debug_ovfl; + +static int +default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data) +{ + pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data; + int ret = 0; + + if (data == NULL) { + DPRINT(("[%d] no argument passed\n", task->pid)); + return -EINVAL; + } + + DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu)); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL; + + DPRINT(("buf_size=%lu\n", arg->buf_size)); + + return ret; +} + +static int +default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size) +{ + pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; + + /* + * size has been validated in default_validate + */ + *size = arg->buf_size; + + return 0; +} + +static int +default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data) +{ + pfm_default_smpl_hdr_t *hdr; + pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; + + hdr = (pfm_default_smpl_hdr_t *)buf; + + hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; + hdr->hdr_buf_size = arg->buf_size; + hdr->hdr_cur_offs = sizeof(*hdr); + hdr->hdr_overflows = 0UL; + hdr->hdr_count = 0UL; + + DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", + task->pid, + buf, + hdr->hdr_buf_size, + sizeof(*hdr), + hdr->hdr_version, + hdr->hdr_cur_offs)); + + return 0; +} + +static int +default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t 
*arg, struct pt_regs *regs, unsigned long stamp) +{ + pfm_default_smpl_hdr_t *hdr; + pfm_default_smpl_entry_t *ent; + void *cur, *last; + unsigned long *e, entry_size; + unsigned int npmds, i; + unsigned char ovfl_pmd; + unsigned char ovfl_notify; + + if (unlikely(buf == NULL || arg == NULL|| regs == NULL || task == NULL)) { + DPRINT(("[%d] invalid arguments buf=%p arg=%p\n", task->pid, buf, arg)); + return -EINVAL; + } + + hdr = (pfm_default_smpl_hdr_t *)buf; + cur = buf+hdr->hdr_cur_offs; + last = buf+hdr->hdr_buf_size; + ovfl_pmd = arg->ovfl_pmd; + ovfl_notify = arg->ovfl_notify; + + /* + * precheck for sanity + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; + + npmds = hweight64(arg->smpl_pmds[0]); + + ent = (pfm_default_smpl_entry_t *)cur; + + prefetch(arg->smpl_pmds_values); + + entry_size = sizeof(*ent) + (npmds << 3); + + /* position for first pmd */ + e = (unsigned long *)(ent+1); + + hdr->hdr_count++; + + DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n", + task->pid, + hdr->hdr_count, + cur, last, + last-cur, + ovfl_pmd, + ovfl_notify, npmds)); + + /* + * current = task running at the time of the overflow. + * + * per-task mode: + * - this is ususally the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - this is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->ovfl_pmd = ovfl_pmd; + ent->last_reset_val = arg->pmd_last_reset; //pmd[0].reg_last_reset_val; + + /* + * where did the fault happen (includes slot number) + */ + ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3); + + ent->tstamp = stamp; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->tgid = current->tgid; + + /* + * selectively store PMDs in increasing index number + */ + if (npmds) { + unsigned long *val = arg->smpl_pmds_values; + for(i=0; i < npmds; i++) { + *e++ = *val++; + } + } + + /* + * update position for next entry + */ + hdr->hdr_cur_offs += entry_size; + cur += entry_size; + + /* + * post check to avoid losing the last sample + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; + + /* + * keep same ovfl_pmds, ovfl_notify + */ + arg->ovfl_ctrl.bits.notify_user = 0; + arg->ovfl_ctrl.bits.block_task = 0; + arg->ovfl_ctrl.bits.mask_monitoring = 0; + arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */ + + return 0; +full: + DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify)); + + /* + * increment number of buffer overflow. + * important to detect duplicate set of samples. 
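For reference, walking the buffer this handler fills is straightforward from user level. A sketch, assuming buf points at the sampling area whose address the context-creation call hands back (ctx_smpl_vaddr), and that the monitoring tool knows npmds, the number of PMDs it asked to be recorded per sample (the entry layout itself does not store it):

#include <stdio.h>
#include <asm/perfmon.h>
#include <asm/perfmon_default_smpl.h>

static void
dump_samples(void *buf, unsigned int npmds)
{
	pfm_default_smpl_hdr_t *hdr = buf;
	char *pos = (char *)buf + sizeof(*hdr);	/* hdr_cur_offs starts here */
	unsigned long i;
	unsigned int j;

	for (i = 0; i < hdr->hdr_count; i++) {
		pfm_default_smpl_entry_t *ent = (pfm_default_smpl_entry_t *)pos;
		unsigned long *pmds = (unsigned long *)(ent + 1);	/* PMD values follow */

		printf("pid=%d ovfl_pmd=%d ip=0x%lx", ent->pid, ent->ovfl_pmd, ent->ip);
		for (j = 0; j < npmds; j++)
			printf(" pmd[%u]=0x%lx", j, pmds[j]);
		printf("\n");

		/* mirrors entry_size on the kernel side: fixed header + npmds*8 */
		pos += sizeof(*ent) + (npmds << 3);
	}
}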
+ */ + hdr->hdr_overflows++; + + /* + * if no notification requested, then we saturate the buffer + */ + if (ovfl_notify == 0) { + arg->ovfl_ctrl.bits.notify_user = 0; + arg->ovfl_ctrl.bits.block_task = 0; + arg->ovfl_ctrl.bits.mask_monitoring = 1; + arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; + } else { + arg->ovfl_ctrl.bits.notify_user = 1; + arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */ + arg->ovfl_ctrl.bits.mask_monitoring = 1; + arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */ + } + return -1; /* we are full, sorry */ +} + +static int +default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) +{ + pfm_default_smpl_hdr_t *hdr; + + hdr = (pfm_default_smpl_hdr_t *)buf; + + hdr->hdr_count = 0UL; + hdr->hdr_cur_offs = sizeof(*hdr); + + ctrl->bits.mask_monitoring = 0; + ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */ + + return 0; +} + +static int +default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) +{ + DPRINT(("[%d] exit(%p)\n", task->pid, buf)); + return 0; +} + +static pfm_buffer_fmt_t default_fmt={ + .fmt_name = "default_format", + .fmt_uuid = PFM_DEFAULT_SMPL_UUID, + .fmt_arg_size = sizeof(pfm_default_smpl_arg_t), + .fmt_validate = default_validate, + .fmt_getsize = default_get_size, + .fmt_init = default_init, + .fmt_handler = default_handler, + .fmt_restart = default_restart, + .fmt_restart_active = default_restart, + .fmt_exit = default_exit, +}; + +static int __init +pfm_default_smpl_init_module(void) +{ + int ret; + + ret = pfm_register_buffer_fmt(&default_fmt); + if (ret == 0) { + printk("perfmon_default_smpl: %s v%u.%u registered\n", + default_fmt.fmt_name, + PFM_DEFAULT_SMPL_VERSION_MAJ, + PFM_DEFAULT_SMPL_VERSION_MIN); + } else { + printk("perfmon_default_smpl: %s cannot register ret=%d\n", + default_fmt.fmt_name, + ret); + } + + return ret; +} + +static void __exit +pfm_default_smpl_cleanup_module(void) +{ + int ret; + ret = pfm_unregister_buffer_fmt(default_fmt.fmt_uuid); + + printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret); +} + +module_init(pfm_default_smpl_init_module); +module_exit(pfm_default_smpl_cleanup_module); + diff --git a/arch/ia64/kernel/perfmon_generic.h b/arch/ia64/kernel/perfmon_generic.h new file mode 100644 index 000000000000..67489478041e --- /dev/null +++ b/arch/ia64/kernel/perfmon_generic.h @@ -0,0 +1,45 @@ +/* + * This file contains the generic PMU register description tables + * and pmc checker used by perfmon.c. 
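The module above doubles as a template for third-party sampling formats: implement the pfm_buffer_fmt_t hooks and register the format under a fresh UUID. A shape-only skeleton follows; the UUID is a placeholder, and whether the core accepts a format that omits the validate/getsize/init callbacks depends on the checks inside pfm_register_buffer_fmt(), so treat it as illustrative rather than loadable as-is.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <asm/perfmon.h>

/* hypothetical 16-byte UUID; a real format must pick its own */
#define MY_FMT_UUID { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
		      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }

static int
my_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg,
	   struct pt_regs *regs, unsigned long stamp)
{
	/* record the sample into buf here ... then tell the core what to do:
	 * the module must fill in all control bits before returning */
	arg->ovfl_ctrl.bits.notify_user     = 0;
	arg->ovfl_ctrl.bits.block_task      = 0;
	arg->ovfl_ctrl.bits.mask_monitoring = 0;
	arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1;
	return 0;	/* -1 would mean "buffer full", as in default_handler() */
}

static pfm_buffer_fmt_t my_fmt = {
	.fmt_name    = "example_format",
	.fmt_uuid    = MY_FMT_UUID,
	.fmt_handler = my_handler,
};

static int __init my_init(void)
{
	return pfm_register_buffer_fmt(&my_fmt);
}

static void __exit my_exit(void)
{
	pfm_unregister_buffer_fmt(my_fmt.fmt_uuid);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");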
+ * + * Copyright (C) 2002-2003 Hewlett Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + */ + +static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={ +/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, + { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +}; + +static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={ +/* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, +/* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, +/* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, +/* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, +/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, +/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, +/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, +/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, + { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +}; + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static pmu_config_t pmu_conf_gen={ + .pmu_name = "Generic", + .pmu_family = 0xff, /* any */ + .ovfl_val = (1UL << 32) - 1, + .num_ibrs = 0, /* does not use */ + .num_dbrs = 0, /* does not use */ + .pmd_desc = pfm_gen_pmd_desc, + .pmc_desc = pfm_gen_pmc_desc +}; + diff --git a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h new file mode 100644 index 000000000000..d1d508a0fbd3 --- /dev/null +++ b/arch/ia64/kernel/perfmon_itanium.h @@ -0,0 +1,115 @@ +/* + * This file contains the Itanium PMU register description tables + * and pmc checker used by perfmon.c. 
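A note on reading these tables: the two four-word bitmask arrays at the end of each descriptor encode register dependencies, with RDEP(n) presumably expanding to bit n (its definition lives earlier in perfmon.c, outside this hunk). So pmc4's {RDEP(4), ...} says control register pmc4 drives data register pmd4, and pmd4's reverse mask points back at pmc4. In sketch form:

/* assumed helper, matching how the tables above are consumed */
#define RDEP(num)	(1UL << (num))

/* e.g. counting the pmds associated with one pmc's dependency word */
static unsigned int
count_dep_pmds(unsigned long dep_pmd0)
{
	unsigned int i, n = 0;

	for (i = 0; dep_pmd0; i++, dep_pmd0 >>= 1)
		if (dep_pmd0 & 0x1)
			n++;		/* pmd i is driven by this pmc */

	return n;
}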
+ * + * Copyright (C) 2002-2003 Hewlett Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + */ +static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); + +static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={ +/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, + { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +}; + +static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={ +/* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +/* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +/* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +/* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +/* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, +/* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, +/* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, +/* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, +/* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +/* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +/* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +/* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, 
{RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
+	    { PFM_REG_END     , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
+};
+
+static int
+pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
+{
+	int ret;
+	int is_loaded;
+
+	/* sanity check */
+	if (ctx == NULL) return -EINVAL;
+
+	is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
+
+	/*
+	 * we must clear the (instruction) debug registers if the pmc13.ta bit is cleared
+	 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
+	 */
+	if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) {
+
+		DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val));
+
+		/* don't mix debug with perfmon */
+		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
+
+		/*
+		 * a count of 0 will mark the debug registers as in use and also
+		 * ensure that they are properly cleared.
+		 */
+		ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs);
+		if (ret) return ret;
+	}
+
+	/*
+	 * we must clear the (data) debug registers if the pmc11.pt bit is cleared
+	 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
+	 */
+	if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) {
+
+		DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val));
+
+		/* don't mix debug with perfmon */
+		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
+
+		/*
+		 * a count of 0 will mark the debug registers as in use and also
+		 * ensure that they are properly cleared.
+		 */
+		ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs);
+		if (ret) return ret;
+	}
+	return 0;
+}
+
+/*
+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
+ */
+static pmu_config_t pmu_conf_ita={
+	.pmu_name      = "Itanium",
+	.pmu_family    = 0x7,
+	.ovfl_val      = (1UL << 32) - 1,
+	.pmd_desc      = pfm_ita_pmd_desc,
+	.pmc_desc      = pfm_ita_pmc_desc,
+	.num_ibrs      = 8,
+	.num_dbrs      = 8,
+	.use_rr_dbregs = 1, /* debug registers are used for range restrictions */
+};
+
+
diff --git a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h
new file mode 100644
index 000000000000..9becccda2897
--- /dev/null
+++ b/arch/ia64/kernel/perfmon_mckinley.h
@@ -0,0 +1,187 @@
+/*
+ * This file contains the McKinley PMU register description tables
+ * and pmc checker used by perfmon.c.
+ * + * Copyright (C) 2002-2003 Hewlett Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + */ +static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); + +static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={ +/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +/* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, + { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +}; + +static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={ +/* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +/* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +/* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +/* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, +/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, +/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, +/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, +/* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, 
{RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd9  */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}},
+/* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}},
+	    { PFM_REG_END     , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */
+};
+
+/*
+ * PMC reserved fields must have their power-up values preserved
+ */
+static int
+pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs)
+{
+	unsigned long tmp1, tmp2, ival = *val;
+
+	/* remove reserved areas from user value */
+	tmp1 = ival & PMC_RSVD_MASK(cnum);
+
+	/* get reserved fields values */
+	tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum);
+
+	*val = tmp1 | tmp2;
+
+	DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n",
+		cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val));
+	return 0;
+}
+
+/*
+ * task can be NULL if the context is unloaded
+ */
+static int
+pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs)
+{
+	int ret = 0, check_case1 = 0;
+	unsigned long val8 = 0, val14 = 0, val13 = 0;
+	int is_loaded;
+
+	/* first preserve the reserved fields */
+	pfm_mck_reserved(cnum, val, regs);
+
+	/* sanity check */
+	if (ctx == NULL) return -EINVAL;
+
+	is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED;
+
+	/*
+	 * we must clear the debug registers if pmc13 has a value which enables
+	 * memory pipeline event constraints. In this case we need to clear the
+	 * debug registers if they have not yet been accessed. This is required
+	 * to avoid picking up stale state.
+	 * PMC13 is "active" if:
+	 *	one of the pmc13.cfg_dbrpXX fields is different from 0x3
+	 *	AND
+	 *	the corresponding pmc13.ena_dbrpXX bit is set.
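+	 *
+	 * Concretely (reading the masks used in the check below, not an
+	 * official description): the ena_dbrpXX bits live under
+	 * 0x1e00000000000UL (bits 45-48) and the tested cfg_dbrpXX bits
+	 * live under 0x18181818UL, all of which are set in the power-up
+	 * default.  E.g. a pmc13 value with bit 45 set while
+	 * (*val & 0x18181818UL) != 0x18181818UL is treated as active.
+	 * Note the test is aggregate rather than strictly pairwise, so
+	 * it can only err on the side of clearing the debug registers.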
+	 */
+	DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded));
+
+	if (cnum == 13 && is_loaded
+	    && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) {
+
+		DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val));
+
+		/* don't mix debug with perfmon */
+		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
+
+		/*
+		 * a count of 0 will mark the debug registers as in use and also
+		 * ensure that they are properly cleared.
+		 */
+		ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs);
+		if (ret) return ret;
+	}
+	/*
+	 * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled
+	 * before they are written (fl_using_dbreg==0) to avoid picking up stale information.
+	 */
+	if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) {
+
+		DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val));
+
+		/* don't mix debug with perfmon */
+		if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL;
+
+		/*
+		 * a count of 0 will mark the debug registers as in use and also
+		 * ensure that they are properly cleared.
+		 */
+		ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs);
+		if (ret) return ret;
+
+	}
+
+	switch(cnum) {
+		case  4: *val |= 1UL << 23; /* force power enable bit */
+			 break;
+		case  8: val8  = *val;
+			 val13 = ctx->ctx_pmcs[13];
+			 val14 = ctx->ctx_pmcs[14];
+			 check_case1 = 1;
+			 break;
+		case 13: val8  = ctx->ctx_pmcs[8];
+			 val13 = *val;
+			 val14 = ctx->ctx_pmcs[14];
+			 check_case1 = 1;
+			 break;
+		case 14: val8  = ctx->ctx_pmcs[8];
+			 val13 = ctx->ctx_pmcs[13];
+			 val14 = *val;
+			 check_case1 = 1;
+			 break;
+	}
+	/* check illegal configuration which can produce inconsistencies in tagging
+	 * i-side events in L1D and L2 caches
+	 */
+	if (check_case1) {
+		ret =   ((val13 >> 45) & 0xf) == 0
+		     && ((val8 & 0x1) == 0)
+		     && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0)
+		         ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0));
+
+		if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n"));
+	}
+
+	return ret ? -EINVAL : 0;
+}
+
+/*
+ * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
+ */
+static pmu_config_t pmu_conf_mck={
+	.pmu_name      = "Itanium 2",
+	.pmu_family    = 0x1f,
+	.flags         = PFM_PMU_IRQ_RESEND,
+	.ovfl_val      = (1UL << 47) - 1,
+	.pmd_desc      = pfm_mck_pmd_desc,
+	.pmc_desc      = pfm_mck_pmc_desc,
+	.num_ibrs      = 8,
+	.num_dbrs      = 8,
+	.use_rr_dbregs = 1 /* debug registers are used for range restrictions */
+};
+
+
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
new file mode 100644
index 000000000000..91293388dd29
--- /dev/null
+++ b/arch/ia64/kernel/process.c
@@ -0,0 +1,800 @@
+/*
+ * Architecture-specific setup.
+ * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ +#define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ +#include <linux/config.h> + +#include <linux/cpu.h> +#include <linux/pm.h> +#include <linux/elf.h> +#include <linux/errno.h> +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/personality.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/thread_info.h> +#include <linux/unistd.h> +#include <linux/efi.h> +#include <linux/interrupt.h> +#include <linux/delay.h> + +#include <asm/cpu.h> +#include <asm/delay.h> +#include <asm/elf.h> +#include <asm/ia32.h> +#include <asm/irq.h> +#include <asm/pgalloc.h> +#include <asm/processor.h> +#include <asm/sal.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/unwind.h> +#include <asm/user.h> + +#include "entry.h" + +#ifdef CONFIG_PERFMON +# include <asm/perfmon.h> +#endif + +#include "sigframe.h" + +void (*ia64_mark_idle)(int); +static cpumask_t cpu_idle_map; + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +void +ia64_do_show_stack (struct unw_frame_info *info, void *arg) +{ + unsigned long ip, sp, bsp; + char buf[128]; /* don't make it so big that it overflows the stack! */ + + printk("\nCall Trace:\n"); + do { + unw_get_ip(info, &ip); + if (ip == 0) + break; + + unw_get_sp(info, &sp); + unw_get_bsp(info, &bsp); + snprintf(buf, sizeof(buf), + " [<%016lx>] %%s\n" + " sp=%016lx bsp=%016lx\n", + ip, sp, bsp); + print_symbol(buf, ip); + } while (unw_unwind(info) >= 0); +} + +void +show_stack (struct task_struct *task, unsigned long *sp) +{ + if (!task) + unw_init_running(ia64_do_show_stack, NULL); + else { + struct unw_frame_info info; + + unw_init_from_blocked_task(&info, task); + ia64_do_show_stack(&info, NULL); + } +} + +void +dump_stack (void) +{ + show_stack(NULL, NULL); +} + +EXPORT_SYMBOL(dump_stack); + +void +show_regs (struct pt_regs *regs) +{ + unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; + + print_modules(); + printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", + regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); + print_symbol("ip is at %s\n", ip); + printk("unat: %016lx pfs : %016lx rsc : %016lx\n", + regs->ar_unat, regs->ar_pfs, regs->ar_rsc); + printk("rnat: %016lx bsps: %016lx pr : %016lx\n", + regs->ar_rnat, regs->ar_bspstore, regs->pr); + printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n", + regs->loadrs, regs->ar_ccv, regs->ar_fpsr); + printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd); + printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7); + printk("f6 : %05lx%016lx f7 : %05lx%016lx\n", + regs->f6.u.bits[1], regs->f6.u.bits[0], + regs->f7.u.bits[1], regs->f7.u.bits[0]); + printk("f8 : %05lx%016lx f9 : %05lx%016lx\n", + regs->f8.u.bits[1], regs->f8.u.bits[0], + regs->f9.u.bits[1], regs->f9.u.bits[0]); + printk("f10 : %05lx%016lx f11 : %05lx%016lx\n", + regs->f10.u.bits[1], regs->f10.u.bits[0], + regs->f11.u.bits[1], regs->f11.u.bits[0]); + + printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3); + printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10); + printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13); + printk("r14 : 
%016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16); + printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19); + printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22); + printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25); + printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28); + printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31); + + if (user_mode(regs)) { + /* print the stacked registers */ + unsigned long val, *bsp, ndirty; + int i, sof, is_nat = 0; + + sof = regs->cr_ifs & 0x7f; /* size of frame */ + ndirty = (regs->loadrs >> 19); + bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty); + for (i = 0; i < sof; ++i) { + get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i)); + printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val, + ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); + } + } else + show_stack(NULL, NULL); +} + +void +do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) +{ + if (fsys_mode(current, &scr->pt)) { + /* defer signal-handling etc. until we return to privilege-level 0. */ + if (!ia64_psr(&scr->pt)->lp) + ia64_psr(&scr->pt)->lp = 1; + return; + } + +#ifdef CONFIG_PERFMON + if (current->thread.pfm_needs_checking) + pfm_handle_work(); +#endif + + /* deal with pending signal delivery */ + if (test_thread_flag(TIF_SIGPENDING)) + ia64_do_signal(oldset, scr, in_syscall); +} + +static int pal_halt = 1; +static int __init nohalt_setup(char * str) +{ + pal_halt = 0; + return 1; +} +__setup("nohalt", nohalt_setup); + +/* + * We use this if we don't have any better idle routine.. + */ +void +default_idle (void) +{ + unsigned long pmu_active = ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_PP | IA64_PSR_UP); + + while (!need_resched()) + if (pal_halt && !pmu_active) + safe_halt(); + else + cpu_relax(); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + extern void ia64_cpu_local_tick (void); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* We shouldn't have to disable interrupts while dead, but + * some interrupts just don't seem to go away, and this makes + * it "work" for testing purposes. */ + max_xtp(); + local_irq_disable(); + /* Death loop */ + while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) + cpu_relax(); + + /* + * Enable timer interrupts from now on + * Not required if we put processor in SAL_BOOT_RENDEZ mode. 
+ */ + local_flush_tlb_all(); + cpu_set(smp_processor_id(), cpu_online_map); + wmb(); + ia64_cpu_local_tick (); + local_irq_enable(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + + +void cpu_idle_wait(void) +{ + int cpu; + cpumask_t map; + + for_each_online_cpu(cpu) + cpu_set(cpu, cpu_idle_map); + + wmb(); + do { + ssleep(1); + cpus_and(map, cpu_idle_map, cpu_online_map); + } while (!cpus_empty(map)); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +void __attribute__((noreturn)) +cpu_idle (void) +{ + void (*mark_idle)(int) = ia64_mark_idle; + int cpu = smp_processor_id(); + + /* endless idle loop with no priority at all */ + while (1) { +#ifdef CONFIG_SMP + if (!need_resched()) + min_xtp(); +#endif + while (!need_resched()) { + void (*idle)(void); + + if (mark_idle) + (*mark_idle)(1); + + if (cpu_isset(cpu, cpu_idle_map)) + cpu_clear(cpu, cpu_idle_map); + rmb(); + idle = pm_idle; + if (!idle) + idle = default_idle; + (*idle)(); + } + + if (mark_idle) + (*mark_idle)(0); + +#ifdef CONFIG_SMP + normal_xtp(); +#endif + schedule(); + check_pgt_cache(); + if (cpu_is_offline(smp_processor_id())) + play_dead(); + } +} + +void +ia64_save_extra (struct task_struct *task) +{ +#ifdef CONFIG_PERFMON + unsigned long info; +#endif + + if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) + ia64_save_debug_regs(&task->thread.dbr[0]); + +#ifdef CONFIG_PERFMON + if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) + pfm_save_regs(task); + + info = __get_cpu_var(pfm_syst_info); + if (info & PFM_CPUINFO_SYST_WIDE) + pfm_syst_wide_update_task(task, info, 0); +#endif + +#ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(ia64_task_regs(task))) + ia32_save_state(task); +#endif +} + +void +ia64_load_extra (struct task_struct *task) +{ +#ifdef CONFIG_PERFMON + unsigned long info; +#endif + + if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) + ia64_load_debug_regs(&task->thread.dbr[0]); + +#ifdef CONFIG_PERFMON + if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) + pfm_load_regs(task); + + info = __get_cpu_var(pfm_syst_info); + if (info & PFM_CPUINFO_SYST_WIDE) + pfm_syst_wide_update_task(task, info, 1); +#endif + +#ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(ia64_task_regs(task))) + ia32_load_state(task); +#endif +} + +/* + * Copy the state of an ia-64 thread. + * + * We get here through the following call chain: + * + * from user-level: from kernel: + * + * <clone syscall> <some kernel call frames> + * sys_clone : + * do_fork do_fork + * copy_thread copy_thread + * + * This means that the stack layout is as follows: + * + * +---------------------+ (highest addr) + * | struct pt_regs | + * +---------------------+ + * | struct switch_stack | + * +---------------------+ + * | | + * | memory stack | + * | | <-- sp (lowest addr) + * +---------------------+ + * + * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an + * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, + * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the + * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since + * the stack is page aligned and the page size is at least 4KB, this is always the case, + * so there is nothing to worry about. 
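+ *
+ * Worked example (the offset is invented for illustration): if
+ * &pt->r16 in the parent sits at an address X with (X & 0x1ff) ==
+ * 0x120, then N = 0x120/8 = 36, i.e. bit 36 of ar.unat records r16's
+ * NaT bit when it is spilled there.  Because the child's pt_regs sits
+ * at the same offset within its own page-aligned stack, X mod 512,
+ * and therefore N, is identical in parent and child, which is why
+ * copying the unat words verbatim below is safe.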
+ */ +int +copy_thread (int nr, unsigned long clone_flags, + unsigned long user_stack_base, unsigned long user_stack_size, + struct task_struct *p, struct pt_regs *regs) +{ + extern char ia64_ret_from_clone, ia32_ret_from_clone; + struct switch_stack *child_stack, *stack; + unsigned long rbs, child_rbs, rbs_size; + struct pt_regs *child_ptregs; + int retval = 0; + +#ifdef CONFIG_SMP + /* + * For SMP idle threads, fork_by_hand() calls do_fork with + * NULL regs. + */ + if (!regs) + return 0; +#endif + + stack = ((struct switch_stack *) regs) - 1; + + child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; + child_stack = (struct switch_stack *) child_ptregs - 1; + + /* copy parent's switch_stack & pt_regs to child: */ + memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); + + rbs = (unsigned long) current + IA64_RBS_OFFSET; + child_rbs = (unsigned long) p + IA64_RBS_OFFSET; + rbs_size = stack->ar_bspstore - rbs; + + /* copy the parent's register backing store to the child: */ + memcpy((void *) child_rbs, (void *) rbs, rbs_size); + + if (likely(user_mode(child_ptregs))) { + if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) + child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ + if (user_stack_base) { + child_ptregs->r12 = user_stack_base + user_stack_size - 16; + child_ptregs->ar_bspstore = user_stack_base; + child_ptregs->ar_rnat = 0; + child_ptregs->loadrs = 0; + } + } else { + /* + * Note: we simply preserve the relative position of + * the stack pointer here. There is no need to + * allocate a scratch area here, since that will have + * been taken care of by the caller of sys_clone() + * already. + */ + child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ + child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ + } + child_stack->ar_bspstore = child_rbs + rbs_size; + if (IS_IA32_PROCESS(regs)) + child_stack->b0 = (unsigned long) &ia32_ret_from_clone; + else + child_stack->b0 = (unsigned long) &ia64_ret_from_clone; + + /* copy parts of thread_struct: */ + p->thread.ksp = (unsigned long) child_stack - 16; + + /* stop some PSR bits from being inherited. + * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() + * therefore we must specify them explicitly here and not include them in + * IA64_PSR_BITS_TO_CLEAR. + */ + child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); + + /* + * NOTE: The calling convention considers all floating point + * registers in the high partition (fph) to be scratch. Since + * the only way to get to this point is through a system call, + * we know that the values in fph are all dead. Hence, there + * is no need to inherit the fph state from the parent to the + * child and all we have to do is to make sure that + * IA64_THREAD_FPH_VALID is cleared in the child. + * + * XXX We could push this optimization a bit further by + * clearing IA64_THREAD_FPH_VALID on ANY system call. + * However, it's not clear this is worth doing. Also, it + * would be a slight deviation from the normal Linux system + * call behavior where scratch registers are preserved across + * system calls (unless used by the system call itself). 
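+	 *
+	 * (Put differently: f32-f127 behave like scratch across the
+	 * syscall boundary, so a parent that computed into f32 right
+	 * before calling clone() could not rely on the child seeing
+	 * that value even if we did copy the fph state here.)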
+ */ +# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ + | IA64_THREAD_PM_VALID) +# define THREAD_FLAGS_TO_SET 0 + p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) + | THREAD_FLAGS_TO_SET); + ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ +#ifdef CONFIG_IA32_SUPPORT + /* + * If we're cloning an IA32 task then save the IA32 extra + * state from the current task to the new task + */ + if (IS_IA32_PROCESS(ia64_task_regs(current))) { + ia32_save_state(p); + if (clone_flags & CLONE_SETTLS) + retval = ia32_clone_tls(p, child_ptregs); + + /* Copy partially mapped page list */ + if (!retval) + retval = ia32_copy_partial_page_list(p, clone_flags); + } +#endif + +#ifdef CONFIG_PERFMON + if (current->thread.pfm_context) + pfm_inherit(p, child_ptregs); +#endif + return retval; +} + +static void +do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) +{ + unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; + elf_greg_t *dst = arg; + struct pt_regs *pt; + char nat; + int i; + + memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */ + + if (unw_unwind_to_user(info) < 0) + return; + + unw_get_sp(info, &sp); + pt = (struct pt_regs *) (sp + 16); + + urbs_end = ia64_get_user_rbs_end(task, pt, &cfm); + + if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0) + return; + + ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end), + &ar_rnat); + + /* + * coredump format: + * r0-r31 + * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) + * predicate registers (p0-p63) + * b0-b7 + * ip cfm user-mask + * ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec + */ + + /* r0 is zero */ + for (i = 1, mask = (1UL << i); i < 32; ++i) { + unw_get_gr(info, i, &dst[i], &nat); + if (nat) + nat_bits |= mask; + mask <<= 1; + } + dst[32] = nat_bits; + unw_get_pr(info, &dst[33]); + + for (i = 0; i < 8; ++i) + unw_get_br(info, i, &dst[34 + i]); + + unw_get_rp(info, &ip); + dst[42] = ip + ia64_psr(pt)->ri; + dst[43] = cfm; + dst[44] = pt->cr_ipsr & IA64_PSR_UM; + + unw_get_ar(info, UNW_AR_RSC, &dst[45]); + /* + * For bsp and bspstore, unw_get_ar() would return the kernel + * addresses, but we need the user-level addresses instead: + */ + dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! 
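+			   (The real ar.bsp at kernel entry can thus be
+			   recovered from this dump as, roughly,
+			   ia64_rse_skip_regs((unsigned long *) dst[46],
+			   -(dst[43] & 0x7f)), since dst[43] holds cfm and
+			   cfm.sof is its low 7 bits.)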
*/ + dst[47] = pt->ar_bspstore; + dst[48] = ar_rnat; + unw_get_ar(info, UNW_AR_CCV, &dst[49]); + unw_get_ar(info, UNW_AR_UNAT, &dst[50]); + unw_get_ar(info, UNW_AR_FPSR, &dst[51]); + dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */ + unw_get_ar(info, UNW_AR_LC, &dst[53]); + unw_get_ar(info, UNW_AR_EC, &dst[54]); + unw_get_ar(info, UNW_AR_CSD, &dst[55]); + unw_get_ar(info, UNW_AR_SSD, &dst[56]); +} + +void +do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg) +{ + elf_fpreg_t *dst = arg; + int i; + + memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */ + + if (unw_unwind_to_user(info) < 0) + return; + + /* f0 is 0.0, f1 is 1.0 */ + + for (i = 2; i < 32; ++i) + unw_get_fr(info, i, dst + i); + + ia64_flush_fph(task); + if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0) + memcpy(dst + 32, task->thread.fph, 96*16); +} + +void +do_copy_regs (struct unw_frame_info *info, void *arg) +{ + do_copy_task_regs(current, info, arg); +} + +void +do_dump_fpu (struct unw_frame_info *info, void *arg) +{ + do_dump_task_fpu(current, info, arg); +} + +int +dump_task_regs(struct task_struct *task, elf_gregset_t *regs) +{ + struct unw_frame_info tcore_info; + + if (current == task) { + unw_init_running(do_copy_regs, regs); + } else { + memset(&tcore_info, 0, sizeof(tcore_info)); + unw_init_from_blocked_task(&tcore_info, task); + do_copy_task_regs(task, &tcore_info, regs); + } + return 1; +} + +void +ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) +{ + unw_init_running(do_copy_regs, dst); +} + +int +dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) +{ + struct unw_frame_info tcore_info; + + if (current == task) { + unw_init_running(do_dump_fpu, dst); + } else { + memset(&tcore_info, 0, sizeof(tcore_info)); + unw_init_from_blocked_task(&tcore_info, task); + do_dump_task_fpu(task, &tcore_info, dst); + } + return 1; +} + +int +dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) +{ + unw_init_running(do_dump_fpu, dst); + return 1; /* f0-f31 are always valid so we always return 1 */ +} + +long +sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, + struct pt_regs *regs) +{ + char *fname; + int error; + + fname = getname(filename); + error = PTR_ERR(fname); + if (IS_ERR(fname)) + goto out; + error = do_execve(fname, argv, envp, regs); + putname(fname); +out: + return error; +} + +pid_t +kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) +{ + extern void start_kernel_thread (void); + unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; + struct { + struct switch_stack sw; + struct pt_regs pt; + } regs; + + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ + regs.pt.r9 = (unsigned long) fn; /* 1st argument */ + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; + regs.sw.pr = (1 << PRED_KERNEL_STACK); + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* This gets called from kernel_thread() via ia64_invoke_thread_helper(). 
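+   (Illustrative use: kernel_thread(my_worker_fn, NULL, CLONE_KERNEL);
+   my_worker_fn() then runs with this helper as its caller, and the
+   IA32 fixups below ensure the new thread runs as a plain 64-bit
+   process.  my_worker_fn is a made-up name, not a kernel symbol.)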
*/ +int +kernel_thread_helper (int (*fn)(void *), void *arg) +{ +#ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(ia64_task_regs(current))) { + /* A kernel thread is always a 64-bit process. */ + current->thread.map_base = DEFAULT_MAP_BASE; + current->thread.task_size = DEFAULT_TASK_SIZE; + ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); + ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); + } +#endif + return (*fn)(arg); +} + +/* + * Flush thread state. This is called when a thread does an execve(). + */ +void +flush_thread (void) +{ + /* drop floating-point and debug-register state if it exists: */ + current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); + ia64_drop_fpu(current); + if (IS_IA32_PROCESS(ia64_task_regs(current))) + ia32_drop_partial_page_list(current); +} + +/* + * Clean up state associated with current thread. This is called when + * the thread calls exit(). + */ +void +exit_thread (void) +{ + ia64_drop_fpu(current); +#ifdef CONFIG_PERFMON + /* if needed, stop monitoring and flush state to perfmon context */ + if (current->thread.pfm_context) + pfm_exit_thread(current); + + /* free debug register resources */ + if (current->thread.flags & IA64_THREAD_DBG_VALID) + pfm_release_debug_registers(current); +#endif + if (IS_IA32_PROCESS(ia64_task_regs(current))) + ia32_drop_partial_page_list(current); +} + +unsigned long +get_wchan (struct task_struct *p) +{ + struct unw_frame_info info; + unsigned long ip; + int count = 0; + + /* + * Note: p may not be a blocked task (it could be current or + * another process running on some other CPU. Rather than + * trying to determine if p is really blocked, we just assume + * it's blocked and rely on the unwind routines to fail + * gracefully if the process wasn't really blocked after all. + * --davidm 99/12/15 + */ + unw_init_from_blocked_task(&info, p); + do { + if (unw_unwind(&info) < 0) + return 0; + unw_get_ip(&info, &ip); + if (!in_sched_functions(ip)) + return ip; + } while (count++ < 16); + return 0; +} + +void +cpu_halt (void) +{ + pal_power_mgmt_info_u_t power_info[8]; + unsigned long min_power; + int i, min_power_state; + + if (ia64_pal_halt_info(power_info) != 0) + return; + + min_power_state = 0; + min_power = power_info[0].pal_power_mgmt_info_s.power_consumption; + for (i = 1; i < 8; ++i) + if (power_info[i].pal_power_mgmt_info_s.im + && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) { + min_power = power_info[i].pal_power_mgmt_info_s.power_consumption; + min_power_state = i; + } + + while (1) + ia64_pal_halt(min_power_state); +} + +void +machine_restart (char *restart_cmd) +{ + (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); +} + +EXPORT_SYMBOL(machine_restart); + +void +machine_halt (void) +{ + cpu_halt(); +} + +EXPORT_SYMBOL(machine_halt); + +void +machine_power_off (void) +{ + if (pm_power_off) + pm_power_off(); + machine_halt(); +} + +EXPORT_SYMBOL(machine_power_off); diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c new file mode 100644 index 000000000000..55789fcd7210 --- /dev/null +++ b/arch/ia64/kernel/ptrace.c @@ -0,0 +1,1627 @@ +/* + * Kernel support for the ptrace() and syscall tracing interfaces. + * + * Copyright (C) 1999-2005 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * Derived from the x86 and Alpha versions. 
+ */ +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/smp_lock.h> +#include <linux/user.h> +#include <linux/security.h> +#include <linux/audit.h> + +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/ptrace_offsets.h> +#include <asm/rse.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/unwind.h> +#ifdef CONFIG_PERFMON +#include <asm/perfmon.h> +#endif + +#include "entry.h" + +/* + * Bits in the PSR that we allow ptrace() to change: + * be, up, ac, mfl, mfh (the user mask; five bits total) + * db (debug breakpoint fault; one bit) + * id (instruction debug fault disable; one bit) + * dd (data debug fault disable; one bit) + * ri (restart instruction; two bits) + * is (instruction set; one bit) + */ +#define IPSR_MASK (IA64_PSR_UM | IA64_PSR_DB | IA64_PSR_IS \ + | IA64_PSR_ID | IA64_PSR_DD | IA64_PSR_RI) + +#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */ +#define PFM_MASK MASK(38) + +#define PTRACE_DEBUG 0 + +#if PTRACE_DEBUG +# define dprintk(format...) printk(format) +# define inline +#else +# define dprintk(format...) +#endif + +/* Return TRUE if PT was created due to kernel-entry via a system-call. */ + +static inline int +in_syscall (struct pt_regs *pt) +{ + return (long) pt->cr_ifs >= 0; +} + +/* + * Collect the NaT bits for r1-r31 from scratch_unat and return a NaT + * bitset where bit i is set iff the NaT bit of register i is set. + */ +unsigned long +ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat) +{ +# define GET_BITS(first, last, unat) \ + ({ \ + unsigned long bit = ia64_unat_pos(&pt->r##first); \ + unsigned long nbits = (last - first + 1); \ + unsigned long mask = MASK(nbits) << first; \ + unsigned long dist; \ + if (bit < first) \ + dist = 64 + bit - first; \ + else \ + dist = bit - first; \ + ia64_rotr(unat, dist) & mask; \ + }) + unsigned long val; + + /* + * Registers that are stored consecutively in struct pt_regs + * can be handled in parallel. If the register order in + * struct_pt_regs changes, this code MUST be updated. + */ + val = GET_BITS( 1, 1, scratch_unat); + val |= GET_BITS( 2, 3, scratch_unat); + val |= GET_BITS(12, 13, scratch_unat); + val |= GET_BITS(14, 14, scratch_unat); + val |= GET_BITS(15, 15, scratch_unat); + val |= GET_BITS( 8, 11, scratch_unat); + val |= GET_BITS(16, 31, scratch_unat); + return val; + +# undef GET_BITS +} + +/* + * Set the NaT bits for the scratch registers according to NAT and + * return the resulting unat (assuming the scratch registers are + * stored in PT). + */ +unsigned long +ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat) +{ +# define PUT_BITS(first, last, nat) \ + ({ \ + unsigned long bit = ia64_unat_pos(&pt->r##first); \ + unsigned long nbits = (last - first + 1); \ + unsigned long mask = MASK(nbits) << first; \ + long dist; \ + if (bit < first) \ + dist = 64 + bit - first; \ + else \ + dist = bit - first; \ + ia64_rotl(nat & mask, dist); \ + }) + unsigned long scratch_unat; + + /* + * Registers that are stored consecutively in struct pt_regs + * can be handled in parallel. If the register order in + * struct_pt_regs changes, this code MUST be updated. 
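+	 *
+	 * A worked instance (the unat offset is invented): if
+	 * ia64_unat_pos(&pt->r16) == 10, then PUT_BITS(16, 31, nat)
+	 * computes dist = 64 + 10 - 16 = 58 and rotates (nat & mask)
+	 * left by 58, moving the bit for r16 from position 16 to unat
+	 * position 10; this is exactly the inverse of the ia64_rotr()
+	 * performed by GET_BITS above.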
+ */ + scratch_unat = PUT_BITS( 1, 1, nat); + scratch_unat |= PUT_BITS( 2, 3, nat); + scratch_unat |= PUT_BITS(12, 13, nat); + scratch_unat |= PUT_BITS(14, 14, nat); + scratch_unat |= PUT_BITS(15, 15, nat); + scratch_unat |= PUT_BITS( 8, 11, nat); + scratch_unat |= PUT_BITS(16, 31, nat); + + return scratch_unat; + +# undef PUT_BITS +} + +#define IA64_MLX_TEMPLATE 0x2 +#define IA64_MOVL_OPCODE 6 + +void +ia64_increment_ip (struct pt_regs *regs) +{ + unsigned long w0, ri = ia64_psr(regs)->ri + 1; + + if (ri > 2) { + ri = 0; + regs->cr_iip += 16; + } else if (ri == 2) { + get_user(w0, (char __user *) regs->cr_iip + 0); + if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) { + /* + * rfi'ing to slot 2 of an MLX bundle causes + * an illegal operation fault. We don't want + * that to happen... + */ + ri = 0; + regs->cr_iip += 16; + } + } + ia64_psr(regs)->ri = ri; +} + +void +ia64_decrement_ip (struct pt_regs *regs) +{ + unsigned long w0, ri = ia64_psr(regs)->ri - 1; + + if (ia64_psr(regs)->ri == 0) { + regs->cr_iip -= 16; + ri = 2; + get_user(w0, (char __user *) regs->cr_iip + 0); + if (((w0 >> 1) & 0xf) == IA64_MLX_TEMPLATE) { + /* + * rfi'ing to slot 2 of an MLX bundle causes + * an illegal operation fault. We don't want + * that to happen... + */ + ri = 1; + } + } + ia64_psr(regs)->ri = ri; +} + +/* + * This routine is used to read an rnat bits that are stored on the + * kernel backing store. Since, in general, the alignment of the user + * and kernel are different, this is not completely trivial. In + * essence, we need to construct the user RNAT based on up to two + * kernel RNAT values and/or the RNAT value saved in the child's + * pt_regs. + * + * user rbs + * + * +--------+ <-- lowest address + * | slot62 | + * +--------+ + * | rnat | 0x....1f8 + * +--------+ + * | slot00 | \ + * +--------+ | + * | slot01 | > child_regs->ar_rnat + * +--------+ | + * | slot02 | / kernel rbs + * +--------+ +--------+ + * <- child_regs->ar_bspstore | slot61 | <-- krbs + * +- - - - + +--------+ + * | slot62 | + * +- - - - + +--------+ + * | rnat | + * +- - - - + +--------+ + * vrnat | slot00 | + * +- - - - + +--------+ + * = = + * +--------+ + * | slot00 | \ + * +--------+ | + * | slot01 | > child_stack->ar_rnat + * +--------+ | + * | slot02 | / + * +--------+ + * <--- child_stack->ar_bspstore + * + * The way to think of this code is as follows: bit 0 in the user rnat + * corresponds to some bit N (0 <= N <= 62) in one of the kernel rnat + * value. The kernel rnat value holding this bit is stored in + * variable rnat0. rnat1 is loaded with the kernel rnat value that + * form the upper bits of the user rnat value. + * + * Boundary cases: + * + * o when reading the rnat "below" the first rnat slot on the kernel + * backing store, rnat0/rnat1 are set to 0 and the low order bits are + * merged in from pt->ar_rnat. + * + * o when reading the rnat "above" the last rnat slot on the kernel + * backing store, rnat0/rnat1 gets its value from sw->ar_rnat. 
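+ *
+ * To make the index arithmetic concrete (numbers invented): if slot 0
+ * of the user rnat range maps to kernel slot number 17 (i.e. shift ==
+ * 17), then user rnat bits 0..45 are taken from bits 17..62 of the
+ * word at rnat0_kaddr, and user bits 46..62 from bits 0..16 of the
+ * word 64 slots further on, at rnat1_kaddr.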
+ */
+static unsigned long
+get_rnat (struct task_struct *task, struct switch_stack *sw,
+	  unsigned long *krbs, unsigned long *urnat_addr,
+	  unsigned long *urbs_end)
+{
+	unsigned long rnat0 = 0, rnat1 = 0, urnat = 0, *slot0_kaddr;
+	unsigned long umask = 0, mask, m;
+	unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
+	long num_regs, nbits;
+	struct pt_regs *pt;
+
+	pt = ia64_task_regs(task);
+	kbsp = (unsigned long *) sw->ar_bspstore;
+	ubspstore = (unsigned long *) pt->ar_bspstore;
+
+	if (urbs_end < urnat_addr)
+		nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_end);
+	else
+		nbits = 63;
+	mask = MASK(nbits);
+	/*
+	 * First, figure out which bit number slot 0 in user-land maps
+	 * to in the kernel rnat.  Do this by figuring out how many
+	 * register slots we're beyond the user's backingstore and
+	 * then computing the equivalent address in kernel space.
+	 */
+	num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
+	slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
+	shift = ia64_rse_slot_num(slot0_kaddr);
+	rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
+	rnat0_kaddr = rnat1_kaddr - 64;
+
+	if (ubspstore + 63 > urnat_addr) {
+		/* some bits need to be merged in from pt->ar_rnat */
+		umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
+		urnat = (pt->ar_rnat & umask);
+		mask &= ~umask;
+		if (!mask)
+			return urnat;
+	}
+
+	m = mask << shift;
+	if (rnat0_kaddr >= kbsp)
+		rnat0 = sw->ar_rnat;
+	else if (rnat0_kaddr > krbs)
+		rnat0 = *rnat0_kaddr;
+	urnat |= (rnat0 & m) >> shift;
+
+	m = mask >> (63 - shift);
+	if (rnat1_kaddr >= kbsp)
+		rnat1 = sw->ar_rnat;
+	else if (rnat1_kaddr > krbs)
+		rnat1 = *rnat1_kaddr;
+	urnat |= (rnat1 & m) << (63 - shift);
+	return urnat;
+}
+
+/*
+ * The reverse of get_rnat.
+ */
+static void
+put_rnat (struct task_struct *task, struct switch_stack *sw,
+	  unsigned long *krbs, unsigned long *urnat_addr, unsigned long urnat,
+	  unsigned long *urbs_end)
+{
+	unsigned long rnat0 = 0, rnat1 = 0, *slot0_kaddr, umask = 0, mask, m;
+	unsigned long *kbsp, *ubspstore, *rnat0_kaddr, *rnat1_kaddr, shift;
+	long num_regs, nbits;
+	struct pt_regs *pt;
+	unsigned long cfm, *urbs_kargs;
+
+	pt = ia64_task_regs(task);
+	kbsp = (unsigned long *) sw->ar_bspstore;
+	ubspstore = (unsigned long *) pt->ar_bspstore;
+
+	urbs_kargs = urbs_end;
+	if (in_syscall(pt)) {
+		/*
+		 * If entered via syscall, don't allow user to set rnat bits
+		 * for syscall args.
+		 */
+		cfm = pt->cr_ifs;
+		urbs_kargs = ia64_rse_skip_regs(urbs_end, -(cfm & 0x7f));
+	}
+
+	if (urbs_kargs >= urnat_addr)
+		nbits = 63;
+	else {
+		if ((urnat_addr - 63) >= urbs_kargs)
+			return;
+		nbits = ia64_rse_num_regs(urnat_addr - 63, urbs_kargs);
+	}
+	mask = MASK(nbits);
+
+	/*
+	 * First, figure out which bit number slot 0 in user-land maps
+	 * to in the kernel rnat.  Do this by figuring out how many
+	 * register slots we're beyond the user's backingstore and
+	 * then computing the equivalent address in kernel space.
+	 */
+	num_regs = ia64_rse_num_regs(ubspstore, urnat_addr + 1);
+	slot0_kaddr = ia64_rse_skip_regs(krbs, num_regs);
+	shift = ia64_rse_slot_num(slot0_kaddr);
+	rnat1_kaddr = ia64_rse_rnat_addr(slot0_kaddr);
+	rnat0_kaddr = rnat1_kaddr - 64;
+
+	if (ubspstore + 63 > urnat_addr) {
+		/* some bits need to be placed in pt->ar_rnat: */
+		umask = MASK(ia64_rse_slot_num(ubspstore)) & mask;
+		pt->ar_rnat = (pt->ar_rnat & ~umask) | (urnat & umask);
+		mask &= ~umask;
+		if (!mask)
+			return;
+	}
+	/*
+	 * Note: Section 11.1 of the EAS guarantees that bit 63 of an
+	 * rnat slot is ignored, 
so we don't have to clear it here. + */ + rnat0 = (urnat << shift); + m = mask << shift; + if (rnat0_kaddr >= kbsp) + sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat0 & m); + else if (rnat0_kaddr > krbs) + *rnat0_kaddr = ((*rnat0_kaddr & ~m) | (rnat0 & m)); + + rnat1 = (urnat >> (63 - shift)); + m = mask >> (63 - shift); + if (rnat1_kaddr >= kbsp) + sw->ar_rnat = (sw->ar_rnat & ~m) | (rnat1 & m); + else if (rnat1_kaddr > krbs) + *rnat1_kaddr = ((*rnat1_kaddr & ~m) | (rnat1 & m)); +} + +static inline int +on_kernel_rbs (unsigned long addr, unsigned long bspstore, + unsigned long urbs_end) +{ + unsigned long *rnat_addr = ia64_rse_rnat_addr((unsigned long *) + urbs_end); + return (addr >= bspstore && addr <= (unsigned long) rnat_addr); +} + +/* + * Read a word from the user-level backing store of task CHILD. ADDR + * is the user-level address to read the word from, VAL a pointer to + * the return value, and USER_BSP gives the end of the user-level + * backing store (i.e., it's the address that would be in ar.bsp after + * the user executed a "cover" instruction). + * + * This routine takes care of accessing the kernel register backing + * store for those registers that got spilled there. It also takes + * care of calculating the appropriate RNaT collection words. + */ +long +ia64_peek (struct task_struct *child, struct switch_stack *child_stack, + unsigned long user_rbs_end, unsigned long addr, long *val) +{ + unsigned long *bspstore, *krbs, regnum, *laddr, *urbs_end, *rnat_addr; + struct pt_regs *child_regs; + size_t copied; + long ret; + + urbs_end = (long *) user_rbs_end; + laddr = (unsigned long *) addr; + child_regs = ia64_task_regs(child); + bspstore = (unsigned long *) child_regs->ar_bspstore; + krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; + if (on_kernel_rbs(addr, (unsigned long) bspstore, + (unsigned long) urbs_end)) + { + /* + * Attempt to read the RBS in an area that's actually + * on the kernel RBS => read the corresponding bits in + * the kernel RBS. + */ + rnat_addr = ia64_rse_rnat_addr(laddr); + ret = get_rnat(child, child_stack, krbs, rnat_addr, urbs_end); + + if (laddr == rnat_addr) { + /* return NaT collection word itself */ + *val = ret; + return 0; + } + + if (((1UL << ia64_rse_slot_num(laddr)) & ret) != 0) { + /* + * It is implementation dependent whether the + * data portion of a NaT value gets saved on a + * st8.spill or RSE spill (e.g., see EAS 2.6, + * 4.4.4.6 Register Spill and Fill). To get + * consistent behavior across all possible + * IA-64 implementations, we return zero in + * this case. + */ + *val = 0; + return 0; + } + + if (laddr < urbs_end) { + /* + * The desired word is on the kernel RBS and + * is not a NaT. 
+ */ + regnum = ia64_rse_num_regs(bspstore, laddr); + *val = *ia64_rse_skip_regs(krbs, regnum); + return 0; + } + } + copied = access_process_vm(child, addr, &ret, sizeof(ret), 0); + if (copied != sizeof(ret)) + return -EIO; + *val = ret; + return 0; +} + +long +ia64_poke (struct task_struct *child, struct switch_stack *child_stack, + unsigned long user_rbs_end, unsigned long addr, long val) +{ + unsigned long *bspstore, *krbs, regnum, *laddr; + unsigned long *urbs_end = (long *) user_rbs_end; + struct pt_regs *child_regs; + + laddr = (unsigned long *) addr; + child_regs = ia64_task_regs(child); + bspstore = (unsigned long *) child_regs->ar_bspstore; + krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; + if (on_kernel_rbs(addr, (unsigned long) bspstore, + (unsigned long) urbs_end)) + { + /* + * Attempt to write the RBS in an area that's actually + * on the kernel RBS => write the corresponding bits + * in the kernel RBS. + */ + if (ia64_rse_is_rnat_slot(laddr)) + put_rnat(child, child_stack, krbs, laddr, val, + urbs_end); + else { + if (laddr < urbs_end) { + regnum = ia64_rse_num_regs(bspstore, laddr); + *ia64_rse_skip_regs(krbs, regnum) = val; + } + } + } else if (access_process_vm(child, addr, &val, sizeof(val), 1) + != sizeof(val)) + return -EIO; + return 0; +} + +/* + * Calculate the address of the end of the user-level register backing + * store. This is the address that would have been stored in ar.bsp + * if the user had executed a "cover" instruction right before + * entering the kernel. If CFMP is not NULL, it is used to return the + * "current frame mask" that was active at the time the kernel was + * entered. + */ +unsigned long +ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt, + unsigned long *cfmp) +{ + unsigned long *krbs, *bspstore, cfm = pt->cr_ifs; + long ndirty; + + krbs = (unsigned long *) child + IA64_RBS_OFFSET/8; + bspstore = (unsigned long *) pt->ar_bspstore; + ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19)); + + if (in_syscall(pt)) + ndirty += (cfm & 0x7f); + else + cfm &= ~(1UL << 63); /* clear valid bit */ + + if (cfmp) + *cfmp = cfm; + return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty); +} + +/* + * Synchronize (i.e, write) the RSE backing store living in kernel + * space to the VM of the CHILD task. SW and PT are the pointers to + * the switch_stack and pt_regs structures, respectively. + * USER_RBS_END is the user-level address at which the backing store + * ends. + */ +long +ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw, + unsigned long user_rbs_start, unsigned long user_rbs_end) +{ + unsigned long addr, val; + long ret; + + /* now copy word for word from kernel rbs to user rbs: */ + for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) { + ret = ia64_peek(child, sw, user_rbs_end, addr, &val); + if (ret < 0) + return ret; + if (access_process_vm(child, addr, &val, sizeof(val), 1) + != sizeof(val)) + return -EIO; + } + return 0; +} + +static inline int +thread_matches (struct task_struct *thread, unsigned long addr) +{ + unsigned long thread_rbs_end; + struct pt_regs *thread_regs; + + if (ptrace_check_attach(thread, 0) < 0) + /* + * If the thread is not in an attachable state, we'll + * ignore it. The net effect is that if ADDR happens + * to overlap with the portion of the thread's + * register backing store that is currently residing + * on the thread's kernel stack, then ptrace() may end + * up accessing a stale value. 
But if the thread
+		 * isn't stopped, that's a problem anyhow, so we're
+		 * doing as well as we can...
+		 */
+		return 0;
+
+	thread_regs = ia64_task_regs(thread);
+	thread_rbs_end = ia64_get_user_rbs_end(thread, thread_regs, NULL);
+	if (!on_kernel_rbs(addr, thread_regs->ar_bspstore, thread_rbs_end))
+		return 0;
+
+	return 1;	/* looks like we've got a winner */
+}
+
+/*
+ * GDB apparently wants to be able to read the register-backing store
+ * of any thread when attached to a given process.  If we are peeking
+ * or poking an address that happens to reside in the kernel-backing
+ * store of another thread, we need to attach to that thread, because
+ * otherwise we end up accessing stale data.
+ *
+ * task_list_lock must be read-locked before calling this routine!
+ */
+static struct task_struct *
+find_thread_for_addr (struct task_struct *child, unsigned long addr)
+{
+	struct task_struct *g, *p;
+	struct mm_struct *mm;
+	int mm_users;
+
+	if (!(mm = get_task_mm(child)))
+		return child;
+
+	/* -1 because of our get_task_mm(): */
+	mm_users = atomic_read(&mm->mm_users) - 1;
+	if (mm_users <= 1)
+		goto out;		/* not multi-threaded */
+
+	/*
+	 * First, traverse the child's thread-list.  Good for scalability with
+	 * NPTL-threads.
+	 */
+	p = child;
+	do {
+		if (thread_matches(p, addr)) {
+			child = p;
+			goto out;
+		}
+		if (mm_users-- <= 1)
+			goto out;
+	} while ((p = next_thread(p)) != child);
+
+	do_each_thread(g, p) {
+		if (p->mm != mm)	/* compare the iterated thread's mm, not the child's */
+			continue;
+
+		if (thread_matches(p, addr)) {
+			child = p;
+			goto out;
+		}
+	} while_each_thread(g, p);
+  out:
+	mmput(mm);
+	return child;
+}
+
+/*
+ * Write f32-f127 back to task->thread.fph if it has been modified.
+ */
+inline void
+ia64_flush_fph (struct task_struct *task)
+{
+	struct ia64_psr *psr = ia64_psr(ia64_task_regs(task));
+
+	if (ia64_is_local_fpu_owner(task) && psr->mfh) {
+		psr->mfh = 0;
+		task->thread.flags |= IA64_THREAD_FPH_VALID;
+		ia64_save_fpu(&task->thread.fph[0]);
+	}
+}
+
+/*
+ * Sync the fph state of the task so that it can be manipulated
+ * through thread.fph.  If necessary, f32-f127 are written back to
+ * thread.fph or, if the fph state hasn't been used before, thread.fph
+ * is cleared to zeroes.  Also, access to f32-f127 is disabled to
+ * ensure that the task picks up the state from thread.fph when it
+ * executes again.
+ */
+void
+ia64_sync_fph (struct task_struct *task)
+{
+	struct ia64_psr *psr = ia64_psr(ia64_task_regs(task));
+
+	ia64_flush_fph(task);
+	if (!(task->thread.flags & IA64_THREAD_FPH_VALID)) {
+		task->thread.flags |= IA64_THREAD_FPH_VALID;
+		memset(&task->thread.fph, 0, sizeof(task->thread.fph));
+	}
+	ia64_drop_fpu(task);
+	psr->dfh = 1;
+}
+
+static int
+access_fr (struct unw_frame_info *info, int regnum, int hi,
+	   unsigned long *data, int write_access)
+{
+	struct ia64_fpreg fpval;
+	int ret;
+
+	ret = unw_get_fr(info, regnum, &fpval);
+	if (ret < 0)
+		return ret;
+
+	if (write_access) {
+		fpval.u.bits[hi] = *data;
+		ret = unw_set_fr(info, regnum, fpval);
+	} else
+		*data = fpval.u.bits[hi];
+	return ret;
+}
+
+/*
+ * Change the machine-state of CHILD such that it will return via the normal
+ * kernel exit-path, rather than the syscall-exit path.
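+ *
+ * In outline, the code below unwinds out of the kernel until the
+ * return pointer leaves the gate page (ip < FIXADDR_USER_END), flips
+ * PRED_SYSCALL off and PRED_NON_SYSCALL on in that frame's predicates,
+ * and rewrites cr_ifs as (1UL << 63) | cfm so the frame is treated as
+ * a valid interrupt frame on the way out.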
+ */ +static void +convert_to_non_syscall (struct task_struct *child, struct pt_regs *pt, + unsigned long cfm) +{ + struct unw_frame_info info, prev_info; + unsigned long ip, pr; + + unw_init_from_blocked_task(&info, child); + while (1) { + prev_info = info; + if (unw_unwind(&info) < 0) + return; + if (unw_get_rp(&info, &ip) < 0) + return; + if (ip < FIXADDR_USER_END) + break; + } + + unw_get_pr(&prev_info, &pr); + pr &= ~(1UL << PRED_SYSCALL); + pr |= (1UL << PRED_NON_SYSCALL); + unw_set_pr(&prev_info, pr); + + pt->cr_ifs = (1UL << 63) | cfm; +} + +static int +access_nat_bits (struct task_struct *child, struct pt_regs *pt, + struct unw_frame_info *info, + unsigned long *data, int write_access) +{ + unsigned long regnum, nat_bits, scratch_unat, dummy = 0; + char nat = 0; + + if (write_access) { + nat_bits = *data; + scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits); + if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) { + dprintk("ptrace: failed to set ar.unat\n"); + return -1; + } + for (regnum = 4; regnum <= 7; ++regnum) { + unw_get_gr(info, regnum, &dummy, &nat); + unw_set_gr(info, regnum, dummy, + (nat_bits >> regnum) & 1); + } + } else { + if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) { + dprintk("ptrace: failed to read ar.unat\n"); + return -1; + } + nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat); + for (regnum = 4; regnum <= 7; ++regnum) { + unw_get_gr(info, regnum, &dummy, &nat); + nat_bits |= (nat != 0) << regnum; + } + *data = nat_bits; + } + return 0; +} + +static int +access_uarea (struct task_struct *child, unsigned long addr, + unsigned long *data, int write_access) +{ + unsigned long *ptr, regnum, urbs_end, rnat_addr, cfm; + struct switch_stack *sw; + struct pt_regs *pt; +# define pt_reg_addr(pt, reg) ((void *) \ + ((unsigned long) (pt) \ + + offsetof(struct pt_regs, reg))) + + + pt = ia64_task_regs(child); + sw = (struct switch_stack *) (child->thread.ksp + 16); + + if ((addr & 0x7) != 0) { + dprintk("ptrace: unaligned register address 0x%lx\n", addr); + return -1; + } + + if (addr < PT_F127 + 16) { + /* accessing fph */ + if (write_access) + ia64_sync_fph(child); + else + ia64_flush_fph(child); + ptr = (unsigned long *) + ((unsigned long) &child->thread.fph + addr); + } else if ((addr >= PT_F10) && (addr < PT_F11 + 16)) { + /* scratch registers untouched by kernel (saved in pt_regs) */ + ptr = pt_reg_addr(pt, f10) + (addr - PT_F10); + } else if (addr >= PT_F12 && addr < PT_F15 + 16) { + /* + * Scratch registers untouched by kernel (saved in + * switch_stack). 
+		 */
+		ptr = (unsigned long *) ((long) sw +
+					 (addr - PT_NAT_BITS - 32));
+	} else if (addr < PT_AR_LC + 8) {
+		/* preserved state: */
+		struct unw_frame_info info;
+		char nat = 0;
+		int ret;
+
+		unw_init_from_blocked_task(&info, child);
+		if (unw_unwind_to_user(&info) < 0)
+			return -1;
+
+		switch (addr) {
+		      case PT_NAT_BITS:
+			return access_nat_bits(child, pt, &info,
+					       data, write_access);
+
+		      case PT_R4: case PT_R5: case PT_R6: case PT_R7:
+			if (write_access) {
+				/* read NaT bit first: */
+				unsigned long dummy;
+
+				ret = unw_get_gr(&info, (addr - PT_R4)/8 + 4,
+						 &dummy, &nat);
+				if (ret < 0)
+					return ret;
+			}
+			return unw_access_gr(&info, (addr - PT_R4)/8 + 4, data,
+					     &nat, write_access);
+
+		      case PT_B1: case PT_B2: case PT_B3:
+		      case PT_B4: case PT_B5:
+			return unw_access_br(&info, (addr - PT_B1)/8 + 1, data,
+					     write_access);
+
+		      case PT_AR_EC:
+			return unw_access_ar(&info, UNW_AR_EC, data,
+					     write_access);
+
+		      case PT_AR_LC:
+			return unw_access_ar(&info, UNW_AR_LC, data,
+					     write_access);
+
+		      default:
+			if (addr >= PT_F2 && addr < PT_F5 + 16)
+				return access_fr(&info, (addr - PT_F2)/16 + 2,
+						 (addr & 8) != 0, data,
+						 write_access);
+			else if (addr >= PT_F16 && addr < PT_F31 + 16)
+				return access_fr(&info,
+						 (addr - PT_F16)/16 + 16,
+						 (addr & 8) != 0,
+						 data, write_access);
+			else {
+				dprintk("ptrace: rejecting access to register "
+					"address 0x%lx\n", addr);
+				return -1;
+			}
+		}
+	} else if (addr < PT_F9+16) {
+		/* scratch state */
+		switch (addr) {
+		      case PT_AR_BSP:
+			/*
+			 * By convention, we use PT_AR_BSP to refer to
+			 * the end of the user-level backing store.
+			 * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof)
+			 * to get the real value of ar.bsp at the time
+			 * the kernel was entered.
+			 *
+			 * Furthermore, when changing the contents of
+			 * PT_AR_BSP (or PT_CFM) we MUST copy any
+			 * user-level stacked registers that are
+			 * stored on the kernel stack back to
+			 * user-space because otherwise, we might end
+			 * up clobbering kernel stacked registers.
+			 * Also, if this happens while the task is
+			 * blocked in a system call, we convert the
+			 * state such that the non-system-call exit
+			 * path is used.  This ensures that the proper
+			 * state will be picked up when resuming
+			 * execution.  However, it *also* means that
+			 * once we write PT_AR_BSP/PT_CFM, it won't be
+			 * possible to modify the syscall arguments of
+			 * the pending system call any longer.  This
+			 * shouldn't be an issue because modifying
+			 * PT_AR_BSP/PT_CFM generally implies that
+			 * we're either abandoning the pending system
+			 * call or that we defer its re-execution
+			 * (e.g., due to GDB doing an inferior
+			 * function call).
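+			 *
+			 * For example (hypothetical user-level sketch,
+			 * not code from this file), a debugger wanting
+			 * the real ar.bsp at kernel entry could combine
+			 * the two conventions above, roughly:
+			 *
+			 *   bsp = ptrace(PTRACE_PEEKUSR, pid, PT_AR_BSP, 0);
+			 *   cfm = ptrace(PTRACE_PEEKUSR, pid, PT_CFM, 0);
+			 *   real_bsp = ia64_rse_skip_regs((unsigned long *)
+			 *                                 bsp, -(cfm & 0x7f));
+			 *
+			 * where cfm & 0x7f is CFM.sof and the skip
+			 * helper is the one from <asm/rse.h>.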
+ */ + urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); + if (write_access) { + if (*data != urbs_end) { + if (ia64_sync_user_rbs(child, sw, + pt->ar_bspstore, + urbs_end) < 0) + return -1; + if (in_syscall(pt)) + convert_to_non_syscall(child, + pt, + cfm); + /* + * Simulate user-level write + * of ar.bsp: + */ + pt->loadrs = 0; + pt->ar_bspstore = *data; + } + } else + *data = urbs_end; + return 0; + + case PT_CFM: + urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); + if (write_access) { + if (((cfm ^ *data) & PFM_MASK) != 0) { + if (ia64_sync_user_rbs(child, sw, + pt->ar_bspstore, + urbs_end) < 0) + return -1; + if (in_syscall(pt)) + convert_to_non_syscall(child, + pt, + cfm); + pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK) + | (*data & PFM_MASK)); + } + } else + *data = cfm; + return 0; + + case PT_CR_IPSR: + if (write_access) + pt->cr_ipsr = ((*data & IPSR_MASK) + | (pt->cr_ipsr & ~IPSR_MASK)); + else + *data = (pt->cr_ipsr & IPSR_MASK); + return 0; + + case PT_AR_RNAT: + urbs_end = ia64_get_user_rbs_end(child, pt, NULL); + rnat_addr = (long) ia64_rse_rnat_addr((long *) + urbs_end); + if (write_access) + return ia64_poke(child, sw, urbs_end, + rnat_addr, *data); + else + return ia64_peek(child, sw, urbs_end, + rnat_addr, data); + + case PT_R1: + ptr = pt_reg_addr(pt, r1); + break; + case PT_R2: case PT_R3: + ptr = pt_reg_addr(pt, r2) + (addr - PT_R2); + break; + case PT_R8: case PT_R9: case PT_R10: case PT_R11: + ptr = pt_reg_addr(pt, r8) + (addr - PT_R8); + break; + case PT_R12: case PT_R13: + ptr = pt_reg_addr(pt, r12) + (addr - PT_R12); + break; + case PT_R14: + ptr = pt_reg_addr(pt, r14); + break; + case PT_R15: + ptr = pt_reg_addr(pt, r15); + break; + case PT_R16: case PT_R17: case PT_R18: case PT_R19: + case PT_R20: case PT_R21: case PT_R22: case PT_R23: + case PT_R24: case PT_R25: case PT_R26: case PT_R27: + case PT_R28: case PT_R29: case PT_R30: case PT_R31: + ptr = pt_reg_addr(pt, r16) + (addr - PT_R16); + break; + case PT_B0: + ptr = pt_reg_addr(pt, b0); + break; + case PT_B6: + ptr = pt_reg_addr(pt, b6); + break; + case PT_B7: + ptr = pt_reg_addr(pt, b7); + break; + case PT_F6: case PT_F6+8: case PT_F7: case PT_F7+8: + case PT_F8: case PT_F8+8: case PT_F9: case PT_F9+8: + ptr = pt_reg_addr(pt, f6) + (addr - PT_F6); + break; + case PT_AR_BSPSTORE: + ptr = pt_reg_addr(pt, ar_bspstore); + break; + case PT_AR_RSC: + ptr = pt_reg_addr(pt, ar_rsc); + break; + case PT_AR_UNAT: + ptr = pt_reg_addr(pt, ar_unat); + break; + case PT_AR_PFS: + ptr = pt_reg_addr(pt, ar_pfs); + break; + case PT_AR_CCV: + ptr = pt_reg_addr(pt, ar_ccv); + break; + case PT_AR_FPSR: + ptr = pt_reg_addr(pt, ar_fpsr); + break; + case PT_CR_IIP: + ptr = pt_reg_addr(pt, cr_iip); + break; + case PT_PR: + ptr = pt_reg_addr(pt, pr); + break; + /* scratch register */ + + default: + /* disallow accessing anything else... */ + dprintk("ptrace: rejecting access to register " + "address 0x%lx\n", addr); + return -1; + } + } else if (addr <= PT_AR_SSD) { + ptr = pt_reg_addr(pt, ar_csd) + (addr - PT_AR_CSD); + } else { + /* access debug registers */ + + if (addr >= PT_IBR) { + regnum = (addr - PT_IBR) >> 3; + ptr = &child->thread.ibr[0]; + } else { + regnum = (addr - PT_DBR) >> 3; + ptr = &child->thread.dbr[0]; + } + + if (regnum >= 8) { + dprintk("ptrace: rejecting access to register " + "address 0x%lx\n", addr); + return -1; + } +#ifdef CONFIG_PERFMON + /* + * Check if debug registers are used by perfmon. This + * test must be done once we know that we can do the + * operation, i.e. 
the arguments are all valid, but + * before we start modifying the state. + * + * Perfmon needs to keep a count of how many processes + * are trying to modify the debug registers for system + * wide monitoring sessions. + * + * We also include read access here, because they may + * cause the PMU-installed debug register state + * (dbr[], ibr[]) to be reset. The two arrays are also + * used by perfmon, but we do not use + * IA64_THREAD_DBG_VALID. The registers are restored + * by the PMU context switch code. + */ + if (pfm_use_debug_registers(child)) return -1; +#endif + + if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { + child->thread.flags |= IA64_THREAD_DBG_VALID; + memset(child->thread.dbr, 0, + sizeof(child->thread.dbr)); + memset(child->thread.ibr, 0, + sizeof(child->thread.ibr)); + } + + ptr += regnum; + + if ((regnum & 1) && write_access) { + /* don't let the user set kernel-level breakpoints: */ + *ptr = *data & ~(7UL << 56); + return 0; + } + } + if (write_access) + *ptr = *data; + else + *data = *ptr; + return 0; +} + +static long +ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) +{ + unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val; + struct unw_frame_info info; + struct ia64_fpreg fpval; + struct switch_stack *sw; + struct pt_regs *pt; + long ret, retval = 0; + char nat = 0; + int i; + + if (!access_ok(VERIFY_WRITE, ppr, sizeof(struct pt_all_user_regs))) + return -EIO; + + pt = ia64_task_regs(child); + sw = (struct switch_stack *) (child->thread.ksp + 16); + unw_init_from_blocked_task(&info, child); + if (unw_unwind_to_user(&info) < 0) { + return -EIO; + } + + if (((unsigned long) ppr & 0x7) != 0) { + dprintk("ptrace:unaligned register address %p\n", ppr); + return -EIO; + } + + if (access_uarea(child, PT_CR_IPSR, &psr, 0) < 0 + || access_uarea(child, PT_AR_EC, &ec, 0) < 0 + || access_uarea(child, PT_AR_LC, &lc, 0) < 0 + || access_uarea(child, PT_AR_RNAT, &rnat, 0) < 0 + || access_uarea(child, PT_AR_BSP, &bsp, 0) < 0 + || access_uarea(child, PT_CFM, &cfm, 0) + || access_uarea(child, PT_NAT_BITS, &nat_bits, 0)) + return -EIO; + + /* control regs */ + + retval |= __put_user(pt->cr_iip, &ppr->cr_iip); + retval |= __put_user(psr, &ppr->cr_ipsr); + + /* app regs */ + + retval |= __put_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]); + retval |= __put_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]); + retval |= __put_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]); + retval |= __put_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]); + retval |= __put_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]); + retval |= __put_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]); + + retval |= __put_user(ec, &ppr->ar[PT_AUR_EC]); + retval |= __put_user(lc, &ppr->ar[PT_AUR_LC]); + retval |= __put_user(rnat, &ppr->ar[PT_AUR_RNAT]); + retval |= __put_user(bsp, &ppr->ar[PT_AUR_BSP]); + retval |= __put_user(cfm, &ppr->cfm); + + /* gr1-gr3 */ + + retval |= __copy_to_user(&ppr->gr[1], &pt->r1, sizeof(long)); + retval |= __copy_to_user(&ppr->gr[2], &pt->r2, sizeof(long) *2); + + /* gr4-gr7 */ + + for (i = 4; i < 8; i++) { + if (unw_access_gr(&info, i, &val, &nat, 0) < 0) + return -EIO; + retval |= __put_user(val, &ppr->gr[i]); + } + + /* gr8-gr11 */ + + retval |= __copy_to_user(&ppr->gr[8], &pt->r8, sizeof(long) * 4); + + /* gr12-gr15 */ + + retval |= __copy_to_user(&ppr->gr[12], &pt->r12, sizeof(long) * 2); + retval |= __copy_to_user(&ppr->gr[14], &pt->r14, sizeof(long)); + retval |= __copy_to_user(&ppr->gr[15], &pt->r15, sizeof(long)); + + /* gr16-gr31 */ + + retval |= __copy_to_user(&ppr->gr[16], 
			       &pt->r16, sizeof(long) * 16);
+
+	/* b0 */
+
+	retval |= __put_user(pt->b0, &ppr->br[0]);
+
+	/* b1-b5 */
+
+	for (i = 1; i < 6; i++) {
+		if (unw_access_br(&info, i, &val, 0) < 0)
+			return -EIO;
+		/* don't silently drop the error: fold it into retval */
+		retval |= __put_user(val, &ppr->br[i]);
+	}
+
+	/* b6-b7 */
+
+	retval |= __put_user(pt->b6, &ppr->br[6]);
+	retval |= __put_user(pt->b7, &ppr->br[7]);
+
+	/* fr2-fr5 */
+
+	for (i = 2; i < 6; i++) {
+		if (unw_get_fr(&info, i, &fpval) < 0)
+			return -EIO;
+		retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
+	}
+
+	/* fr6-fr11 */
+
+	retval |= __copy_to_user(&ppr->fr[6], &pt->f6,
+				 sizeof(struct ia64_fpreg) * 6);
+
+	/* fp scratch regs(12-15) */
+
+	retval |= __copy_to_user(&ppr->fr[12], &sw->f12,
+				 sizeof(struct ia64_fpreg) * 4);
+
+	/* fr16-fr31 */
+
+	for (i = 16; i < 32; i++) {
+		if (unw_get_fr(&info, i, &fpval) < 0)
+			return -EIO;
+		retval |= __copy_to_user(&ppr->fr[i], &fpval, sizeof (fpval));
+	}
+
+	/* fph */
+
+	ia64_flush_fph(child);
+	retval |= __copy_to_user(&ppr->fr[32], &child->thread.fph,
+				 sizeof(ppr->fr[32]) * 96);
+
+	/* preds */
+
+	retval |= __put_user(pt->pr, &ppr->pr);
+
+	/* nat bits */
+
+	retval |= __put_user(nat_bits, &ppr->nat);
+
+	ret = retval ? -EIO : 0;
+	return ret;
+}
+
+static long
+ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr)
+{
+	unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val = 0;
+	struct unw_frame_info info;
+	struct switch_stack *sw;
+	struct ia64_fpreg fpval;
+	struct pt_regs *pt;
+	long ret, retval = 0;
+	int i;
+
+	memset(&fpval, 0, sizeof(fpval));
+
+	if (!access_ok(VERIFY_READ, ppr, sizeof(struct pt_all_user_regs)))
+		return -EIO;
+
+	pt = ia64_task_regs(child);
+	sw = (struct switch_stack *) (child->thread.ksp + 16);
+	unw_init_from_blocked_task(&info, child);
+	if (unw_unwind_to_user(&info) < 0) {
+		return -EIO;
+	}
+
+	if (((unsigned long) ppr & 0x7) != 0) {
+		dprintk("ptrace:unaligned register address %p\n", ppr);
+		return -EIO;
+	}
+
+	/* control regs */
+
+	retval |= __get_user(pt->cr_iip, &ppr->cr_iip);
+	retval |= __get_user(psr, &ppr->cr_ipsr);
+
+	/* app regs */
+
+	retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]);
+	retval |= __get_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]);
+	retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]);
+	retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]);
+	retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]);
+	retval |= __get_user(pt->ar_fpsr, &ppr->ar[PT_AUR_FPSR]);
+
+	retval |= __get_user(ec, &ppr->ar[PT_AUR_EC]);
+	retval |= __get_user(lc, &ppr->ar[PT_AUR_LC]);
+	retval |= __get_user(rnat, &ppr->ar[PT_AUR_RNAT]);
+	retval |= __get_user(bsp, &ppr->ar[PT_AUR_BSP]);
+	retval |= __get_user(cfm, &ppr->cfm);
+
+	/* gr1-gr3 */
+
+	retval |= __copy_from_user(&pt->r1, &ppr->gr[1], sizeof(long));
+	retval |= __copy_from_user(&pt->r2, &ppr->gr[2], sizeof(long) * 2);
+
+	/* gr4-gr7 */
+
+	for (i = 4; i < 8; i++) {
+		retval |= __get_user(val, &ppr->gr[i]);
+		/* NaT bit will be set via PT_NAT_BITS: */
+		if (unw_set_gr(&info, i, val, 0) < 0)
+			return -EIO;
+	}
+
+	/* gr8-gr11 */
+
+	retval |= __copy_from_user(&pt->r8, &ppr->gr[8], sizeof(long) * 4);
+
+	/* gr12-gr15 */
+
+	retval |= __copy_from_user(&pt->r12, &ppr->gr[12], sizeof(long) * 2);
+	retval |= __copy_from_user(&pt->r14, &ppr->gr[14], sizeof(long));
+	retval |= __copy_from_user(&pt->r15, &ppr->gr[15], sizeof(long));
+
+	/* gr16-gr31 */
+
+	retval |= __copy_from_user(&pt->r16, &ppr->gr[16], sizeof(long) * 16);
+
+	/* b0 */
+
+	retval |= __get_user(pt->b0, &ppr->br[0]);
+
+	/* b1-b5 */
+
+	for (i =
1; i < 6; i++) { + retval |= __get_user(val, &ppr->br[i]); + unw_set_br(&info, i, val); + } + + /* b6-b7 */ + + retval |= __get_user(pt->b6, &ppr->br[6]); + retval |= __get_user(pt->b7, &ppr->br[7]); + + /* fr2-fr5 */ + + for (i = 2; i < 6; i++) { + retval |= __copy_from_user(&fpval, &ppr->fr[i], sizeof(fpval)); + if (unw_set_fr(&info, i, fpval) < 0) + return -EIO; + } + + /* fr6-fr11 */ + + retval |= __copy_from_user(&pt->f6, &ppr->fr[6], + sizeof(ppr->fr[6]) * 6); + + /* fp scratch regs(12-15) */ + + retval |= __copy_from_user(&sw->f12, &ppr->fr[12], + sizeof(ppr->fr[12]) * 4); + + /* fr16-fr31 */ + + for (i = 16; i < 32; i++) { + retval |= __copy_from_user(&fpval, &ppr->fr[i], + sizeof(fpval)); + if (unw_set_fr(&info, i, fpval) < 0) + return -EIO; + } + + /* fph */ + + ia64_sync_fph(child); + retval |= __copy_from_user(&child->thread.fph, &ppr->fr[32], + sizeof(ppr->fr[32]) * 96); + + /* preds */ + + retval |= __get_user(pt->pr, &ppr->pr); + + /* nat bits */ + + retval |= __get_user(nat_bits, &ppr->nat); + + retval |= access_uarea(child, PT_CR_IPSR, &psr, 1); + retval |= access_uarea(child, PT_AR_EC, &ec, 1); + retval |= access_uarea(child, PT_AR_LC, &lc, 1); + retval |= access_uarea(child, PT_AR_RNAT, &rnat, 1); + retval |= access_uarea(child, PT_AR_BSP, &bsp, 1); + retval |= access_uarea(child, PT_CFM, &cfm, 1); + retval |= access_uarea(child, PT_NAT_BITS, &nat_bits, 1); + + ret = retval ? -EIO : 0; + return ret; +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void +ptrace_disable (struct task_struct *child) +{ + struct ia64_psr *child_psr = ia64_psr(ia64_task_regs(child)); + + /* make sure the single step/taken-branch trap bits are not set: */ + child_psr->ss = 0; + child_psr->tb = 0; +} + +asmlinkage long +sys_ptrace (long request, pid_t pid, unsigned long addr, unsigned long data) +{ + struct pt_regs *pt; + unsigned long urbs_end, peek_or_poke; + struct task_struct *child; + struct switch_stack *sw; + long ret; + + lock_kernel(); + ret = -EPERM; + if (request == PTRACE_TRACEME) { + /* are we already being traced? */ + if (current->ptrace & PT_PTRACED) + goto out; + ret = security_ptrace(current->parent, current); + if (ret) + goto out; + current->ptrace |= PT_PTRACED; + ret = 0; + goto out; + } + + peek_or_poke = (request == PTRACE_PEEKTEXT + || request == PTRACE_PEEKDATA + || request == PTRACE_POKETEXT + || request == PTRACE_POKEDATA); + ret = -ESRCH; + read_lock(&tasklist_lock); + { + child = find_task_by_pid(pid); + if (child) { + if (peek_or_poke) + child = find_thread_for_addr(child, addr); + get_task_struct(child); + } + } + read_unlock(&tasklist_lock); + if (!child) + goto out; + ret = -EPERM; + if (pid == 1) /* no messing around with init! 
*/ + goto out_tsk; + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + goto out_tsk; + } + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out_tsk; + + pt = ia64_task_regs(child); + sw = (struct switch_stack *) (child->thread.ksp + 16); + + switch (request) { + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + /* read word at location addr */ + urbs_end = ia64_get_user_rbs_end(child, pt, NULL); + ret = ia64_peek(child, sw, urbs_end, addr, &data); + if (ret == 0) { + ret = data; + /* ensure "ret" is not mistaken as an error code: */ + force_successful_syscall_return(); + } + goto out_tsk; + + case PTRACE_POKETEXT: + case PTRACE_POKEDATA: + /* write the word at location addr */ + urbs_end = ia64_get_user_rbs_end(child, pt, NULL); + ret = ia64_poke(child, sw, urbs_end, addr, data); + goto out_tsk; + + case PTRACE_PEEKUSR: + /* read the word at addr in the USER area */ + if (access_uarea(child, addr, &data, 0) < 0) { + ret = -EIO; + goto out_tsk; + } + ret = data; + /* ensure "ret" is not mistaken as an error code */ + force_successful_syscall_return(); + goto out_tsk; + + case PTRACE_POKEUSR: + /* write the word at addr in the USER area */ + if (access_uarea(child, addr, &data, 1) < 0) { + ret = -EIO; + goto out_tsk; + } + ret = 0; + goto out_tsk; + + case PTRACE_OLD_GETSIGINFO: + /* for backwards-compatibility */ + ret = ptrace_request(child, PTRACE_GETSIGINFO, addr, data); + goto out_tsk; + + case PTRACE_OLD_SETSIGINFO: + /* for backwards-compatibility */ + ret = ptrace_request(child, PTRACE_SETSIGINFO, addr, data); + goto out_tsk; + + case PTRACE_SYSCALL: + /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: + /* restart after signal. */ + ret = -EIO; + if (data > _NSIG) + goto out_tsk; + if (request == PTRACE_SYSCALL) + set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + child->exit_code = data; + + /* + * Make sure the single step/taken-branch trap bits + * are not set: + */ + ia64_psr(pt)->ss = 0; + ia64_psr(pt)->tb = 0; + + wake_up_process(child); + ret = 0; + goto out_tsk; + + case PTRACE_KILL: + /* + * Make the child exit. Best I can do is send it a + * sigkill. Perhaps it should be put in the status + * that it wants to exit. + */ + if (child->exit_state == EXIT_ZOMBIE) + /* already dead */ + goto out_tsk; + child->exit_code = SIGKILL; + + ptrace_disable(child); + wake_up_process(child); + ret = 0; + goto out_tsk; + + case PTRACE_SINGLESTEP: + /* let child execute for one instruction */ + case PTRACE_SINGLEBLOCK: + ret = -EIO; + if (data > _NSIG) + goto out_tsk; + + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + if (request == PTRACE_SINGLESTEP) { + ia64_psr(pt)->ss = 1; + } else { + ia64_psr(pt)->tb = 1; + } + child->exit_code = data; + + /* give it a chance to run. */ + wake_up_process(child); + ret = 0; + goto out_tsk; + + case PTRACE_DETACH: + /* detach a process that was attached. 
	 */
+		ret = ptrace_detach(child, data);
+		goto out_tsk;
+
+	      case PTRACE_GETREGS:
+		ret = ptrace_getregs(child,
+				     (struct pt_all_user_regs __user *) data);
+		goto out_tsk;
+
+	      case PTRACE_SETREGS:
+		ret = ptrace_setregs(child,
+				     (struct pt_all_user_regs __user *) data);
+		goto out_tsk;
+
+	      default:
+		ret = ptrace_request(child, request, addr, data);
+		goto out_tsk;
+	}
+  out_tsk:
+	put_task_struct(child);
+  out:
+	unlock_kernel();
+	return ret;
+}
+
+
+void
+syscall_trace (void)
+{
+	if (!test_thread_flag(TIF_SYSCALL_TRACE))
+		return;
+	if (!(current->ptrace & PT_PTRACED))
+		return;
+	/*
+	 * The 0x80 provides a way for the tracing parent to
+	 * distinguish between a syscall stop and SIGTRAP delivery.
+	 */
+	ptrace_notify(SIGTRAP
+		      | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
+
+	/*
+	 * This isn't the same as continuing with a signal, but it
+	 * will do for normal use.  strace only continues with a
+	 * signal if the stopping signal is not SIGTRAP.  -brl
+	 */
+	if (current->exit_code) {
+		send_sig(current->exit_code, current, 1);
+		current->exit_code = 0;
+	}
+}
+
+/* "asmlinkage" so the input arguments are preserved... */
+
+asmlinkage void
+syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
+		     long arg4, long arg5, long arg6, long arg7,
+		     struct pt_regs regs)
+{
+	long syscall;
+
+	if (unlikely(current->audit_context)) {
+		if (IS_IA32_PROCESS(&regs))
+			syscall = regs.r1;
+		else
+			syscall = regs.r15;
+
+		audit_syscall_entry(current, syscall, arg0, arg1, arg2, arg3);
+	}
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace();
+}
+
+/* "asmlinkage" so the input arguments are preserved... */
+
+asmlinkage void
+syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
+		     long arg4, long arg5, long arg6, long arg7,
+		     struct pt_regs regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, regs.r8);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace();
+}
diff --git a/arch/ia64/kernel/sal.c b/arch/ia64/kernel/sal.c
new file mode 100644
index 000000000000..acc0f132f86c
--- /dev/null
+++ b/arch/ia64/kernel/sal.c
@@ -0,0 +1,302 @@
+/*
+ * System Abstraction Layer (SAL) interface routines.
+ *
+ * Copyright (C) 1998, 1999, 2001, 2003 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ */
+#include <linux/config.h>
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/sal.h>
+#include <asm/pal.h>
+
+ __cacheline_aligned DEFINE_SPINLOCK(sal_lock);
+unsigned long sal_platform_features;
+
+unsigned short sal_revision;
+unsigned short sal_version;
+
+#define SAL_MAJOR(x) ((x) >> 8)
+#define SAL_MINOR(x) ((x) & 0xff)
+
+static struct {
+	void *addr;	/* function entry point */
+	void *gpval;	/* gp value to use */
+} pdesc;
+
+static long
+default_handler (void)
+{
+	return -1;
+}
+
+ia64_sal_handler ia64_sal = (ia64_sal_handler) default_handler;
+ia64_sal_desc_ptc_t *ia64_ptc_domain_info;
+
+const char *
+ia64_sal_strerror (long status)
+{
+	const char *str;
+	switch (status) {
+	      case 0: str = "Call completed without error"; break;
+	      case 1: str = "Effect a warm boot of the system to complete "
+			      "the update"; break;
+	      case -1: str = "Not implemented"; break;
+	      case -2: str = "Invalid argument"; break;
+	      case -3: str = "Call completed with error"; break;
+	      case -4: str = "Virtual address not registered"; break;
+	      case -5: str = "No information available"; break;
+	      case -6: str = "Insufficient space to add the entry"; break;
+	      case -7: str = "Invalid entry_addr value"; break;
+	      case -8: str = "Invalid interrupt vector"; break;
+	      case -9: str = "Requested memory not available"; break;
+	      case -10: str = "Unable to write to the NVM device"; break;
+	      case -11: str = "Invalid partition type specified"; break;
+	      case -12: str = "Invalid NVM_Object id specified"; break;
+	      case -13: str = "NVM_Object already has the maximum number "
+				"of partitions"; break;
+	      case -14: str = "Insufficient space in partition for the "
+				"requested write sub-function"; break;
+	      case -15: str = "Insufficient data buffer space for the "
+				"requested read record sub-function"; break;
+	      case -16: str = "Scratch buffer required for the write/delete "
+				"sub-function"; break;
+	      case -17: str = "Insufficient space in the NVM_Object for the "
+				"requested create sub-function"; break;
+	      case -18: str = "Invalid value specified in the partition_rec "
+				"argument"; break;
+	      case -19: str = "Record oriented I/O not supported for this "
+				"partition"; break;
+	      case -20: str = "Bad format of record to be written or "
+				"required keyword variable not "
+				"specified"; break;
+	      default: str = "Unknown SAL status code"; break;
+	}
+	return str;
+}
+
+void __init
+ia64_sal_handler_init (void *entry_point, void *gpval)
+{
+	/* fill in the SAL procedure descriptor and point ia64_sal to it: */
+	pdesc.addr = entry_point;
+	pdesc.gpval = gpval;
+	ia64_sal = (ia64_sal_handler) &pdesc;
+}
+
+static void __init
+check_versions (struct ia64_sal_systab *systab)
+{
+	sal_revision = (systab->sal_rev_major << 8) | systab->sal_rev_minor;
+	sal_version = (systab->sal_b_rev_major << 8) | systab->sal_b_rev_minor;
+
+	/* Check for broken firmware */
+	if ((sal_revision == SAL_VERSION_CODE(49, 29))
+	    && (sal_version == SAL_VERSION_CODE(49, 29)))
+	{
+		/*
+		 * Old firmware for zx2000 prototypes has this weird
+		 * version number; reset it to something sane.
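+		 * (SAL_VERSION_CODE() is assumed here to pack the BCD
+		 * major/minor digits into the same major<<8 | minor
+		 * layout built above, so SAL_VERSION_CODE(2, 8) reads
+		 * back as "2.8" from the %x-based printk in
+		 * ia64_sal_init below.)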
+		 */
+		sal_revision = SAL_VERSION_CODE(2, 8);
+		sal_version = SAL_VERSION_CODE(0, 0);
+	}
+}
+
+static void __init
+sal_desc_entry_point (void *p)
+{
+	struct ia64_sal_desc_entry_point *ep = p;
+	ia64_pal_handler_init(__va(ep->pal_proc));
+	ia64_sal_handler_init(__va(ep->sal_proc), __va(ep->gp));
+}
+
+#ifdef CONFIG_SMP
+static void __init
+set_smp_redirect (int flag)
+{
+#ifndef CONFIG_HOTPLUG_CPU
+	if (no_int_routing)
+		smp_int_redirect &= ~flag;
+	else
+		smp_int_redirect |= flag;
+#else
+	/*
+	 * For CPU hotplug we don't want to do any chipset-supported
+	 * interrupt redirection, because that would require stopping
+	 * all interrupts and hard-binding the irq to a cpu.  Later,
+	 * when the interrupt fires, we would need to set the redirect
+	 * hint on again in the vector.  This is cumbersome for
+	 * something that the user-mode irq balancer will solve anyway.
+	 */
+	no_int_routing = 1;
+	smp_int_redirect &= ~flag;
+#endif
+}
+#else
+#define set_smp_redirect(flag)	do { } while (0)
+#endif
+
+static void __init
+sal_desc_platform_feature (void *p)
+{
+	struct ia64_sal_desc_platform_feature *pf = p;
+	sal_platform_features = pf->feature_mask;
+
+	printk(KERN_INFO "SAL Platform features:");
+	if (!sal_platform_features) {
+		printk(" None\n");
+		return;
+	}
+
+	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_BUS_LOCK)
+		printk(" BusLock");
+	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT) {
+		printk(" IRQ_Redirection");
+		set_smp_redirect(SMP_IRQ_REDIRECTION);
+	}
+	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT) {
+		printk(" IPI_Redirection");
+		set_smp_redirect(SMP_IPI_REDIRECTION);
+	}
+	if (sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)
+		printk(" ITC_Drift");
+	printk("\n");
+}
+
+#ifdef CONFIG_SMP
+static void __init
+sal_desc_ap_wakeup (void *p)
+{
+	struct ia64_sal_desc_ap_wakeup *ap = p;
+
+	switch (ap->mechanism) {
+	      case IA64_SAL_AP_EXTERNAL_INT:
+		ap_wakeup_vector = ap->vector;
+		printk(KERN_INFO "SAL: AP wakeup using external interrupt "
+		       "vector 0x%lx\n", ap_wakeup_vector);
+		break;
+	      default:
+		printk(KERN_ERR "SAL: AP wakeup mechanism unsupported!\n");
+		break;
+	}
+}
+
+static void __init
+chk_nointroute_opt(void)
+{
+	char *cp;
+	extern char saved_command_line[];
+
+	for (cp = saved_command_line; *cp; ) {
+		if (memcmp(cp, "nointroute", 10) == 0) {
+			no_int_routing = 1;
+			printk ("no_int_routing on\n");
+			break;
+		} else {
+			while (*cp != ' ' && *cp)
+				++cp;
+			while (*cp == ' ')
+				++cp;
+		}
+	}
+}
+
+#else
+static void __init sal_desc_ap_wakeup(void *p) { }
+#endif
+
+void __init
+ia64_sal_init (struct ia64_sal_systab *systab)
+{
+	char *p;
+	int i;
+
+	if (!systab) {
+		printk(KERN_WARNING "Hmm, no SAL System Table.\n");
+		return;
+	}
+
+	if (strncmp(systab->signature, "SST_", 4) != 0)
+		printk(KERN_ERR "bad signature in system table!\n");
+
+	check_versions(systab);
+#ifdef CONFIG_SMP
+	chk_nointroute_opt();
+#endif
+
+	/* revisions are coded in BCD, so %x does the job for us */
+	printk(KERN_INFO "SAL %x.%x: %.32s %.32s%sversion %x.%x\n",
+	       SAL_MAJOR(sal_revision), SAL_MINOR(sal_revision),
+	       systab->oem_id, systab->product_id,
+	       systab->product_id[0] ? " " : "",
+	       SAL_MAJOR(sal_version), SAL_MINOR(sal_version));
+
+	p = (char *) (systab + 1);
+	for (i = 0; i < systab->entry_count; i++) {
+		/*
+		 * The first byte of each entry type contains the type
+		 * descriptor.
+ */ + switch (*p) { + case SAL_DESC_ENTRY_POINT: + sal_desc_entry_point(p); + break; + case SAL_DESC_PLATFORM_FEATURE: + sal_desc_platform_feature(p); + break; + case SAL_DESC_PTC: + ia64_ptc_domain_info = (ia64_sal_desc_ptc_t *)p; + break; + case SAL_DESC_AP_WAKEUP: + sal_desc_ap_wakeup(p); + break; + } + p += SAL_DESC_SIZE(*p); + } +} + +int +ia64_sal_oemcall(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1, + u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7) +{ + if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX) + return -1; + SAL_CALL(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, arg7); + return 0; +} +EXPORT_SYMBOL(ia64_sal_oemcall); + +int +ia64_sal_oemcall_nolock(struct ia64_sal_retval *isrvp, u64 oemfunc, u64 arg1, + u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, + u64 arg7) +{ + if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX) + return -1; + SAL_CALL_NOLOCK(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); + return 0; +} +EXPORT_SYMBOL(ia64_sal_oemcall_nolock); + +int +ia64_sal_oemcall_reentrant(struct ia64_sal_retval *isrvp, u64 oemfunc, + u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, + u64 arg6, u64 arg7) +{ + if (oemfunc < IA64_SAL_OEMFUNC_MIN || oemfunc > IA64_SAL_OEMFUNC_MAX) + return -1; + SAL_CALL_REENTRANT(*isrvp, oemfunc, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); + return 0; +} +EXPORT_SYMBOL(ia64_sal_oemcall_reentrant); diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c new file mode 100644 index 000000000000..d227fabecd02 --- /dev/null +++ b/arch/ia64/kernel/salinfo.c @@ -0,0 +1,629 @@ +/* + * salinfo.c + * + * Creates entries in /proc/sal for various system features. + * + * Copyright (c) 2003 Silicon Graphics, Inc. All rights reserved. + * Copyright (c) 2003 Hewlett-Packard Co + * Bjorn Helgaas <bjorn.helgaas@hp.com> + * + * 10/30/2001 jbarnes@sgi.com copied much of Stephane's palinfo + * code to create this file + * Oct 23 2003 kaos@sgi.com + * Replace IPI with set_cpus_allowed() to read a record from the required cpu. + * Redesign salinfo log processing to separate interrupt and user space + * contexts. + * Cache the record across multi-block reads from user space. + * Support > 64 cpus. + * Delete module_exit and MOD_INC/DEC_COUNT, salinfo cannot be a module. + * + * Jan 28 2004 kaos@sgi.com + * Periodically check for outstanding MCA or INIT records. + * + * Dec 5 2004 kaos@sgi.com + * Standardize which records are cleared automatically. 
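+ *
+ * Illustrative shell session against the interface described below
+ * (a sketch added for clarity, not part of the original changelog):
+ *
+ *	cat /proc/sal/mca/event		# blocks until a cpu has a record
+ *	echo "read 0"  > /proc/sal/mca/data
+ *	cat /proc/sal/mca/data > rec	# fetch cpu 0's binary record
+ *	echo "clear 0" > /proc/sal/mca/data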
+ */
+
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+
+#include <asm/semaphore.h>
+#include <asm/sal.h>
+#include <asm/uaccess.h>
+
+MODULE_AUTHOR("Jesse Barnes <jbarnes@sgi.com>");
+MODULE_DESCRIPTION("/proc interface to IA-64 SAL features");
+MODULE_LICENSE("GPL");
+
+static int salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data);
+
+typedef struct {
+	const char		*name;		/* name of the proc entry */
+	unsigned long		feature;	/* feature bit */
+	struct proc_dir_entry	*entry;		/* registered entry (removal) */
+} salinfo_entry_t;
+
+/*
+ * List {name,feature} pairs for every entry in /proc/sal/<feature>
+ * that this module exports
+ */
+static salinfo_entry_t salinfo_entries[]={
+	{ "bus_lock",          IA64_SAL_PLATFORM_FEATURE_BUS_LOCK, },
+	{ "irq_redirection",   IA64_SAL_PLATFORM_FEATURE_IRQ_REDIR_HINT, },
+	{ "ipi_redirection",   IA64_SAL_PLATFORM_FEATURE_IPI_REDIR_HINT, },
+	{ "itc_drift",         IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT, },
+};
+
+#define NR_SALINFO_ENTRIES ARRAY_SIZE(salinfo_entries)
+
+static char *salinfo_log_name[] = {
+	"mca",
+	"init",
+	"cmc",
+	"cpe",
+};
+
+static struct proc_dir_entry *salinfo_proc_entries[
+	ARRAY_SIZE(salinfo_entries) +			/* /proc/sal/bus_lock */
+	ARRAY_SIZE(salinfo_log_name) +			/* /proc/sal/{mca,...} */
+	(2 * ARRAY_SIZE(salinfo_log_name)) +		/* /proc/sal/mca/{event,data} */
+	1];						/* /proc/sal */
+
+/* Some records we get ourselves, some are accessed as saved data in buffers
+ * that are owned by mca.c.
+ */
+struct salinfo_data_saved {
+	u8*			buffer;
+	u64			size;
+	u64			id;
+	int			cpu;
+};
+
+/* State transitions.  Actions are :-
+ *	Write "read <cpunum>" to the data file.
+ *	Write "clear <cpunum>" to the data file.
+ *	Write "oemdata <cpunum> <offset>" to the data file.
+ *	Read from the data file.
+ *	Close the data file.
+ *
+ * Start state is NO_DATA.
+ *
+ * NO_DATA
+ *	write "read <cpunum>" -> NO_DATA or LOG_RECORD.
+ *	write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
+ *	write "oemdata <cpunum> <offset>" -> return -EINVAL.
+ *	read data -> return EOF.
+ *	close -> unchanged.  Free record areas.
+ *
+ * LOG_RECORD
+ *	write "read <cpunum>" -> NO_DATA or LOG_RECORD.
+ *	write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
+ *	write "oemdata <cpunum> <offset>" -> format the oem data, goto OEMDATA.
+ *	read data -> return the INIT/MCA/CMC/CPE record.
+ *	close -> unchanged.  Keep record areas.
+ *
+ * OEMDATA
+ *	write "read <cpunum>" -> NO_DATA or LOG_RECORD.
+ *	write "clear <cpunum>" -> NO_DATA or LOG_RECORD.
+ *	write "oemdata <cpunum> <offset>" -> format the oem data, goto OEMDATA.
+ *	read data -> return the formatted oemdata.
+ *	close -> unchanged.  Keep record areas.
+ *
+ * Closing the data file does not change the state.  This allows shell scripts
+ * to manipulate salinfo data, each shell redirection opens the file, does one
+ * action then closes it again.  The record areas are only freed at close when
+ * the state is NO_DATA.
+ */
+enum salinfo_state {
+	STATE_NO_DATA,
+	STATE_LOG_RECORD,
+	STATE_OEMDATA,
+};
+
+struct salinfo_data {
+	volatile cpumask_t	cpu_event;	/* which cpus have outstanding events */
+	struct semaphore	sem;		/* count of cpus with outstanding events (bits set in cpu_event) */
+	u8			*log_buffer;
+	u64			log_size;
+	u8			*oemdata;	/* decoded oem data */
+	u64			oemdata_size;
+	int			open;		/* single-open to prevent races */
+	u8			type;
+	u8			saved_num;	/* using a saved record?
 */
+	enum salinfo_state	state :8;	/* processing state */
+	u8			padding;
+	int			cpu_check;	/* next CPU to check */
+	struct salinfo_data_saved data_saved[5];/* save last 5 records from mca.c, must be < 255 */
+};
+
+static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)];
+
+static spinlock_t data_lock, data_saved_lock;
+
+/** salinfo_platform_oemdata - optional callback to decode oemdata from an error
+ * record.
+ * @sect_header: pointer to the start of the section to decode.
+ * @oemdata: returns vmalloc area containing the decoded output.
+ * @oemdata_size: returns length of decoded output (strlen).
+ *
+ * Description: If user space asks for oem data to be decoded by the kernel
+ * and/or prom and the platform has set salinfo_platform_oemdata to the address
+ * of a platform specific routine then call that routine.  salinfo_platform_oemdata
+ * vmalloc's and formats its output area, returning the address of the text
+ * and its strlen.  Returns 0 for success, -ve for error.  The callback is
+ * invoked on the cpu that generated the error record.
+ */
+int (*salinfo_platform_oemdata)(const u8 *sect_header, u8 **oemdata, u64 *oemdata_size);
+
+struct salinfo_platform_oemdata_parms {
+	const u8 *efi_guid;
+	u8 **oemdata;
+	u64 *oemdata_size;
+	int ret;
+};
+
+static void
+salinfo_platform_oemdata_cpu(void *context)
+{
+	struct salinfo_platform_oemdata_parms *parms = context;
+	parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
+}
+
+static void
+shift1_data_saved (struct salinfo_data *data, int shift)
+{
+	memcpy(data->data_saved+shift, data->data_saved+shift+1,
+	       (ARRAY_SIZE(data->data_saved) - (shift+1)) * sizeof(data->data_saved[0]));
+	memset(data->data_saved + ARRAY_SIZE(data->data_saved) - 1, 0,
+	       sizeof(data->data_saved[0]));
}
+
+/* This routine is invoked in interrupt context.  Note: mca.c enables
+ * interrupts before calling this code for CMC/CPE.  MCA and INIT events are
+ * not irq safe, do not call any routines that use spinlocks, they may deadlock.
+ * MCA and INIT records are recorded, a timer event will look for any
+ * outstanding events and wake up the user space code.
+ *
+ * The buffer passed from mca.c points to the output from ia64_log_get. This is
+ * a persistent buffer but its contents can change between the interrupt and
+ * when user space processes the record.  Save the record id to identify
+ * changes.
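+ *
+ * Concretely, the consumer side (salinfo_log_new_read below) copies
+ * the saved buffer, issues a barrier(), then re-reads the id field:
+ * if mca.c overwrote the slot in the meantime the ids differ and the
+ * stale copy is discarded.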
+ */ +void +salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe) +{ + struct salinfo_data *data = salinfo_data + type; + struct salinfo_data_saved *data_saved; + unsigned long flags = 0; + int i; + int saved_size = ARRAY_SIZE(data->data_saved); + + BUG_ON(type >= ARRAY_SIZE(salinfo_log_name)); + + if (irqsafe) + spin_lock_irqsave(&data_saved_lock, flags); + for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) { + if (!data_saved->buffer) + break; + } + if (i == saved_size) { + if (!data->saved_num) { + shift1_data_saved(data, 0); + data_saved = data->data_saved + saved_size - 1; + } else + data_saved = NULL; + } + if (data_saved) { + data_saved->cpu = smp_processor_id(); + data_saved->id = ((sal_log_record_header_t *)buffer)->id; + data_saved->size = size; + data_saved->buffer = buffer; + } + if (irqsafe) + spin_unlock_irqrestore(&data_saved_lock, flags); + + if (!test_and_set_bit(smp_processor_id(), &data->cpu_event)) { + if (irqsafe) + up(&data->sem); + } +} + +/* Check for outstanding MCA/INIT records every minute (arbitrary) */ +#define SALINFO_TIMER_DELAY (60*HZ) +static struct timer_list salinfo_timer; + +static void +salinfo_timeout_check(struct salinfo_data *data) +{ + int i; + if (!data->open) + return; + for (i = 0; i < NR_CPUS; ++i) { + if (test_bit(i, &data->cpu_event)) { + /* double up() is not a problem, user space will see no + * records for the additional "events". + */ + up(&data->sem); + } + } +} + +static void +salinfo_timeout (unsigned long arg) +{ + salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA); + salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT); + salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY; + add_timer(&salinfo_timer); +} + +static int +salinfo_event_open(struct inode *inode, struct file *file) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + +static ssize_t +salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct proc_dir_entry *entry = PDE(inode); + struct salinfo_data *data = entry->data; + char cmd[32]; + size_t size; + int i, n, cpu = -1; + +retry: + if (down_trylock(&data->sem)) { + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + if (down_interruptible(&data->sem)) + return -ERESTARTSYS; + } + + n = data->cpu_check; + for (i = 0; i < NR_CPUS; i++) { + if (test_bit(n, &data->cpu_event)) { + cpu = n; + break; + } + if (++n == NR_CPUS) + n = 0; + } + + if (cpu == -1) + goto retry; + + /* events are sticky until the user says "clear" */ + up(&data->sem); + + /* for next read, start checking at next CPU */ + data->cpu_check = cpu; + if (++data->cpu_check == NR_CPUS) + data->cpu_check = 0; + + snprintf(cmd, sizeof(cmd), "read %d\n", cpu); + + size = strlen(cmd); + if (size > count) + size = count; + if (copy_to_user(buffer, cmd, size)) + return -EFAULT; + + return size; +} + +static struct file_operations salinfo_event_fops = { + .open = salinfo_event_open, + .read = salinfo_event_read, +}; + +static int +salinfo_log_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *entry = PDE(inode); + struct salinfo_data *data = entry->data; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock(&data_lock); + if (data->open) { + spin_unlock(&data_lock); + return -EBUSY; + } + data->open = 1; + spin_unlock(&data_lock); + + if (data->state == STATE_NO_DATA && + !(data->log_buffer = vmalloc(ia64_sal_get_state_info_size(data->type)))) { + data->open = 0; + return -ENOMEM; + } 
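+	/*
+	 * Note that buffers surviving from an earlier open are reused:
+	 * per the state machine above, record areas are only freed at
+	 * close time once the state has returned to NO_DATA.
+	 */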
+ + return 0; +} + +static int +salinfo_log_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *entry = PDE(inode); + struct salinfo_data *data = entry->data; + + if (data->state == STATE_NO_DATA) { + vfree(data->log_buffer); + vfree(data->oemdata); + data->log_buffer = NULL; + data->oemdata = NULL; + } + spin_lock(&data_lock); + data->open = 0; + spin_unlock(&data_lock); + return 0; +} + +static void +call_on_cpu(int cpu, void (*fn)(void *), void *arg) +{ + cpumask_t save_cpus_allowed, new_cpus_allowed; + memcpy(&save_cpus_allowed, ¤t->cpus_allowed, sizeof(save_cpus_allowed)); + memset(&new_cpus_allowed, 0, sizeof(new_cpus_allowed)); + set_bit(cpu, &new_cpus_allowed); + set_cpus_allowed(current, new_cpus_allowed); + (*fn)(arg); + set_cpus_allowed(current, save_cpus_allowed); +} + +static void +salinfo_log_read_cpu(void *context) +{ + struct salinfo_data *data = context; + sal_log_record_header_t *rh; + data->log_size = ia64_sal_get_state_info(data->type, (u64 *) data->log_buffer); + rh = (sal_log_record_header_t *)(data->log_buffer); + /* Clear corrected errors as they are read from SAL */ + if (rh->severity == sal_log_severity_corrected) + ia64_sal_clear_state_info(data->type); +} + +static void +salinfo_log_new_read(int cpu, struct salinfo_data *data) +{ + struct salinfo_data_saved *data_saved; + unsigned long flags; + int i; + int saved_size = ARRAY_SIZE(data->data_saved); + + data->saved_num = 0; + spin_lock_irqsave(&data_saved_lock, flags); +retry: + for (i = 0, data_saved = data->data_saved; i < saved_size; ++i, ++data_saved) { + if (data_saved->buffer && data_saved->cpu == cpu) { + sal_log_record_header_t *rh = (sal_log_record_header_t *)(data_saved->buffer); + data->log_size = data_saved->size; + memcpy(data->log_buffer, rh, data->log_size); + barrier(); /* id check must not be moved */ + if (rh->id == data_saved->id) { + data->saved_num = i+1; + break; + } + /* saved record changed by mca.c since interrupt, discard it */ + shift1_data_saved(data, i); + goto retry; + } + } + spin_unlock_irqrestore(&data_saved_lock, flags); + + if (!data->saved_num) + call_on_cpu(cpu, salinfo_log_read_cpu, data); + if (!data->log_size) { + data->state = STATE_NO_DATA; + clear_bit(cpu, &data->cpu_event); + } else { + data->state = STATE_LOG_RECORD; + } +} + +static ssize_t +salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct proc_dir_entry *entry = PDE(inode); + struct salinfo_data *data = entry->data; + u8 *buf; + u64 bufsize; + + if (data->state == STATE_LOG_RECORD) { + buf = data->log_buffer; + bufsize = data->log_size; + } else if (data->state == STATE_OEMDATA) { + buf = data->oemdata; + bufsize = data->oemdata_size; + } else { + buf = NULL; + bufsize = 0; + } + return simple_read_from_buffer(buffer, count, ppos, buf, bufsize); +} + +static void +salinfo_log_clear_cpu(void *context) +{ + struct salinfo_data *data = context; + ia64_sal_clear_state_info(data->type); +} + +static int +salinfo_log_clear(struct salinfo_data *data, int cpu) +{ + sal_log_record_header_t *rh; + data->state = STATE_NO_DATA; + if (!test_bit(cpu, &data->cpu_event)) + return 0; + down(&data->sem); + clear_bit(cpu, &data->cpu_event); + if (data->saved_num) { + unsigned long flags; + spin_lock_irqsave(&data_saved_lock, flags); + shift1_data_saved(data, data->saved_num - 1 ); + data->saved_num = 0; + spin_unlock_irqrestore(&data_saved_lock, flags); + } + rh = (sal_log_record_header_t *)(data->log_buffer); + /* 
Corrected errors have already been cleared from SAL */ + if (rh->severity != sal_log_severity_corrected) + call_on_cpu(cpu, salinfo_log_clear_cpu, data); + /* clearing a record may make a new record visible */ + salinfo_log_new_read(cpu, data); + if (data->state == STATE_LOG_RECORD && + !test_and_set_bit(cpu, &data->cpu_event)) + up(&data->sem); + return 0; +} + +static ssize_t +salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode; + struct proc_dir_entry *entry = PDE(inode); + struct salinfo_data *data = entry->data; + char cmd[32]; + size_t size; + u32 offset; + int cpu; + + size = sizeof(cmd); + if (count < size) + size = count; + if (copy_from_user(cmd, buffer, size)) + return -EFAULT; + + if (sscanf(cmd, "read %d", &cpu) == 1) { + salinfo_log_new_read(cpu, data); + } else if (sscanf(cmd, "clear %d", &cpu) == 1) { + int ret; + if ((ret = salinfo_log_clear(data, cpu))) + count = ret; + } else if (sscanf(cmd, "oemdata %d %d", &cpu, &offset) == 2) { + if (data->state != STATE_LOG_RECORD && data->state != STATE_OEMDATA) + return -EINVAL; + if (offset > data->log_size - sizeof(efi_guid_t)) + return -EINVAL; + data->state = STATE_OEMDATA; + if (salinfo_platform_oemdata) { + struct salinfo_platform_oemdata_parms parms = { + .efi_guid = data->log_buffer + offset, + .oemdata = &data->oemdata, + .oemdata_size = &data->oemdata_size + }; + call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms); + if (parms.ret) + count = parms.ret; + } else + data->oemdata_size = 0; + } else + return -EINVAL; + + return count; +} + +static struct file_operations salinfo_data_fops = { + .open = salinfo_log_open, + .release = salinfo_log_release, + .read = salinfo_log_read, + .write = salinfo_log_write, +}; + +static int __init +salinfo_init(void) +{ + struct proc_dir_entry *salinfo_dir; /* /proc/sal dir entry */ + struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */ + struct proc_dir_entry *dir, *entry; + struct salinfo_data *data; + int i, j, online; + + salinfo_dir = proc_mkdir("sal", NULL); + if (!salinfo_dir) + return 0; + + for (i=0; i < NR_SALINFO_ENTRIES; i++) { + /* pass the feature bit in question as misc data */ + *sdir++ = create_proc_read_entry (salinfo_entries[i].name, 0, salinfo_dir, + salinfo_read, (void *)salinfo_entries[i].feature); + } + + for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) { + data = salinfo_data + i; + data->type = i; + sema_init(&data->sem, 0); + dir = proc_mkdir(salinfo_log_name[i], salinfo_dir); + if (!dir) + continue; + + entry = create_proc_entry("event", S_IRUSR, dir); + if (!entry) + continue; + entry->data = data; + entry->proc_fops = &salinfo_event_fops; + *sdir++ = entry; + + entry = create_proc_entry("data", S_IRUSR | S_IWUSR, dir); + if (!entry) + continue; + entry->data = data; + entry->proc_fops = &salinfo_data_fops; + *sdir++ = entry; + + /* we missed any events before now */ + online = 0; + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) { + set_bit(j, &data->cpu_event); + ++online; + } + sema_init(&data->sem, online); + + *sdir++ = dir; + } + + *sdir++ = salinfo_dir; + + init_timer(&salinfo_timer); + salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY; + salinfo_timer.function = &salinfo_timeout; + add_timer(&salinfo_timer); + + return 0; +} + +/* + * 'data' contains an integer that corresponds to the feature we're + * testing + */ +static int +salinfo_read(char *page, char **start, off_t off, int count, int *eof, void *data) +{ + int 
len = 0;
+
+	len = sprintf(page, (sal_platform_features & (unsigned long) data) ? "1\n" : "0\n");
+
+	if (len <= off+count) *eof = 1;
+
+	*start = page + off;
+	len   -= off;
+
+	if (len>count) len = count;
+	if (len<0) len = 0;
+
+	return len;
+}
+
+module_init(salinfo_init);
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c
new file mode 100644
index 000000000000..2724ef3fbae2
--- /dev/null
+++ b/arch/ia64/kernel/semaphore.c
@@ -0,0 +1,165 @@
+/*
+ * IA-64 semaphore implementation (derived from x86 version).
+ *
+ * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+/*
+ * Semaphores are implemented using a two-way counter: The "count"
+ * variable is decremented for each process that tries to acquire the
+ * semaphore, while the "sleepers" variable is a count of such
+ * acquires.
+ *
+ * Notably, the inline "up()" and "down()" functions can efficiently
+ * test if they need to do any extra work (up needs to do something
+ * only if count was negative before the increment operation).
+ *
+ * "sleeping" and the contention routine ordering is protected
+ * by the spinlock in the semaphore's waitqueue head.
+ *
+ * Note that these functions are only called when there is contention
+ * on the lock, and as such all this is the "non-critical" part of the
+ * whole semaphore business.  The critical part is the inline stuff in
+ * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
+ */
+#include <linux/sched.h>
+#include <linux/init.h>
+
+#include <asm/errno.h>
+#include <asm/semaphore.h>
+
+/*
+ * Logic:
+ *  - Only on a boundary condition do we need to care.  When we go
+ *    from a negative count to a non-negative, we wake people up.
+ *  - When we go from a non-negative count to a negative, we must
+ *    (a) synchronize with the "sleepers" count and (b) make sure
+ *    that we're on the wakeup list before we synchronize so that
+ *    we cannot lose wakeup events.
+ */
+
+void
+__up (struct semaphore *sem)
+{
+	wake_up(&sem->wait);
+}
+
+void __sched __down (struct semaphore *sem)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_UNINTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * Add "everybody else" into it.  They aren't
+		 * playing, because we own the spinlock in
+		 * the wait_queue_head.
+		 */
+		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
+			sem->sleepers = 0;
+			break;
+		}
+		sem->sleepers = 1;	/* us - see -1 above */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
+		tsk->state = TASK_UNINTERRUPTIBLE;
+	}
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	tsk->state = TASK_RUNNING;
+}
+
+int __sched __down_interruptible (struct semaphore * sem)
+{
+	int retval = 0;
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_INTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * With signals pending, this turns into
+		 * the trylock failure case - we won't be
+		 * sleeping, and we can't get the lock as
+		 * it has contention.  Just correct the count
+		 * and exit.
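+		 *
+		 * Worked example (illustrative): if we are the only
+		 * waiter, sem->sleepers was bumped to 1 on entry, so
+		 * the atomic_add(sleepers, &sem->count) below adds 1
+		 * back, undoing the decrement our down() performed
+		 * before we bail out with -EINTR.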
+ */ + if (signal_pending(current)) { + retval = -EINTR; + sem->sleepers = 0; + atomic_add(sleepers, &sem->count); + break; + } + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * wait_queue_head. The "-1" is because we're + * still hoping to get the semaphore. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_INTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + + tsk->state = TASK_RUNNING; + return retval; +} + +/* + * Trylock failed - make sure we correct for having decremented the + * count. + */ +int +__down_trylock (struct semaphore *sem) +{ + unsigned long flags; + int sleepers; + + spin_lock_irqsave(&sem->wait.lock, flags); + sleepers = sem->sleepers + 1; + sem->sleepers = 0; + + /* + * Add "everybody else" and us into it. They aren't + * playing, because we own the spinlock in the + * wait_queue_head. + */ + if (!atomic_add_negative(sleepers, &sem->count)) { + wake_up_locked(&sem->wait); + } + + spin_unlock_irqrestore(&sem->wait.lock, flags); + return 1; +} diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c new file mode 100644 index 000000000000..f05650c801d2 --- /dev/null +++ b/arch/ia64/kernel/setup.c @@ -0,0 +1,723 @@ +/* + * Architecture-specific setup. + * + * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2000, Rohit Seth <rohit.seth@intel.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * + * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo(). + * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map + * 03/31/00 R.Seth cpu_initialized and current->processor fixes + * 02/04/00 D.Mosberger some more get_cpuinfo fixes... + * 02/01/00 R.Seth fixed get_cpuinfo for SMP + * 01/07/99 S.Eranian added the support for command line argument + * 06/24/99 W.Drummond added boot_cpu_data. + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/console.h> +#include <linux/delay.h> +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/threads.h> +#include <linux/tty.h> +#include <linux/serial.h> +#include <linux/serial_core.h> +#include <linux/efi.h> +#include <linux/initrd.h> + +#include <asm/ia32.h> +#include <asm/machvec.h> +#include <asm/mca.h> +#include <asm/meminit.h> +#include <asm/page.h> +#include <asm/patch.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/sal.h> +#include <asm/sections.h> +#include <asm/serial.h> +#include <asm/setup.h> +#include <asm/smp.h> +#include <asm/system.h> +#include <asm/unistd.h> + +#if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) +# error "struct cpuinfo_ia64 too big!" 
+#endif
+
+#ifdef CONFIG_SMP
+unsigned long __per_cpu_offset[NR_CPUS];
+EXPORT_SYMBOL(__per_cpu_offset);
+#endif
+
+DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
+DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
+DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
+unsigned long ia64_cycles_per_usec;
+struct ia64_boot_param *ia64_boot_param;
+struct screen_info screen_info;
+
+unsigned long ia64_max_cacheline_size;
+unsigned long ia64_iobase;	/* virtual address for I/O accesses */
+EXPORT_SYMBOL(ia64_iobase);
+struct io_space io_space[MAX_IO_SPACES];
+EXPORT_SYMBOL(io_space);
+unsigned int num_io_spaces;
+
+/*
+ * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
+ * mask specifies a mask of address bits that must be 0 in order for two buffers to be
+ * mergeable by the I/O MMU (i.e., the end address of the first buffer and the start
+ * address of the second buffer must be aligned to (merge_mask+1) in order to be
+ * mergeable).  By default, we assume there is no I/O MMU which can merge physically
+ * discontiguous buffers, so we set the merge_mask to ~0UL, which corresponds to an
+ * iommu page-size of 2^64.
+ */
+unsigned long ia64_max_iommu_merge_mask = ~0UL;
+EXPORT_SYMBOL(ia64_max_iommu_merge_mask);
+
+/*
+ * We use a special marker for the end of memory and it uses the extra (+1) slot
+ */
+struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
+int num_rsvd_regions;
+
+
+/*
+ * Filter incoming memory segments based on the primitive map created from the boot
+ * parameters.  Segments contained in the map are removed from the memory ranges.  A
+ * caller-specified function is called with the memory ranges that remain after filtering.
+ * This routine does not assume the incoming segments are sorted.
+ */
+int
+filter_rsvd_memory (unsigned long start, unsigned long end, void *arg)
+{
+	unsigned long range_start, range_end, prev_start;
+	void (*func)(unsigned long, unsigned long, int);
+	int i;
+
+#if IGNORE_PFN0
+	if (start == PAGE_OFFSET) {
+		printk(KERN_WARNING "warning: skipping physical page 0\n");
+		start += PAGE_SIZE;
+		if (start >= end) return 0;
+	}
+#endif
+	/*
+	 * lowest possible address (walker uses virtual)
+	 */
+	prev_start = PAGE_OFFSET;
+	func = arg;
+
+	for (i = 0; i < num_rsvd_regions; ++i) {
+		range_start = max(start, prev_start);
+		range_end   = min(end, rsvd_region[i].start);
+
+		if (range_start < range_end)
+			call_pernode_memory(__pa(range_start), range_end - range_start, func);
+
+		/* nothing more available in this segment */
+		if (range_end == end) return 0;
+
+		prev_start = rsvd_region[i].end;
+	}
+	/* end of memory marker allows full processing inside loop body */
+	return 0;
+}
+
+static void
+sort_regions (struct rsvd_region *rsvd_region, int max)
+{
+	int j;
+
+	/* simple bubble sorting */
+	while (max--) {
+		for (j = 0; j < max; ++j) {
+			if (rsvd_region[j].start > rsvd_region[j+1].start) {
+				struct rsvd_region tmp;
+				tmp = rsvd_region[j];
+				rsvd_region[j] = rsvd_region[j + 1];
+				rsvd_region[j + 1] = tmp;
+			}
+		}
+	}
+}
+
+/**
+ * reserve_memory - setup reserved memory areas
+ *
+ * Setup the reserved memory areas set aside for the boot parameters,
+ * initrd, etc.  There are currently %IA64_MAX_RSVD_REGIONS defined,
+ * see include/asm-ia64/meminit.h if you need to define more.
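+ *
+ * Design note: the table is terminated with a ~0UL "end of memory"
+ * sentinel and kept sorted, which is what lets the loop in
+ * filter_rsvd_memory() above process the final free range without a
+ * post-loop special case.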
+ */ +void +reserve_memory (void) +{ + int n = 0; + + /* + * none of the entries in this table overlap + */ + rsvd_region[n].start = (unsigned long) ia64_boot_param; + rsvd_region[n].end = rsvd_region[n].start + sizeof(*ia64_boot_param); + n++; + + rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->efi_memmap); + rsvd_region[n].end = rsvd_region[n].start + ia64_boot_param->efi_memmap_size; + n++; + + rsvd_region[n].start = (unsigned long) __va(ia64_boot_param->command_line); + rsvd_region[n].end = (rsvd_region[n].start + + strlen(__va(ia64_boot_param->command_line)) + 1); + n++; + + rsvd_region[n].start = (unsigned long) ia64_imva((void *)KERNEL_START); + rsvd_region[n].end = (unsigned long) ia64_imva(_end); + n++; + +#ifdef CONFIG_BLK_DEV_INITRD + if (ia64_boot_param->initrd_start) { + rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start); + rsvd_region[n].end = rsvd_region[n].start + ia64_boot_param->initrd_size; + n++; + } +#endif + + /* end of memory marker */ + rsvd_region[n].start = ~0UL; + rsvd_region[n].end = ~0UL; + n++; + + num_rsvd_regions = n; + + sort_regions(rsvd_region, num_rsvd_regions); +} + +/** + * find_initrd - get initrd parameters from the boot parameter structure + * + * Grab the initrd start and end from the boot parameter struct given to us by + * the boot loader. + */ +void +find_initrd (void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + if (ia64_boot_param->initrd_start) { + initrd_start = (unsigned long)__va(ia64_boot_param->initrd_start); + initrd_end = initrd_start+ia64_boot_param->initrd_size; + + printk(KERN_INFO "Initial ramdisk at: 0x%lx (%lu bytes)\n", + initrd_start, ia64_boot_param->initrd_size); + } +#endif +} + +static void __init +io_port_init (void) +{ + extern unsigned long ia64_iobase; + unsigned long phys_iobase; + + /* + * Set `iobase' to the appropriate address in region 6 (uncached access range). + * + * The EFI memory map is the "preferred" location to get the I/O port space base, + * rather than relying on AR.KR0. This should become more clear in future SAL + * specs. We'll fall back to getting it out of AR.KR0 if no appropriate entry is + * found in the memory map. + */ + phys_iobase = efi_get_iobase(); + if (phys_iobase) + /* set AR.KR0 since this is all we use it for anyway */ + ia64_set_kr(IA64_KR_IO_BASE, phys_iobase); + else { + phys_iobase = ia64_get_kr(IA64_KR_IO_BASE); + printk(KERN_INFO "No I/O port range found in EFI memory map, falling back " + "to AR.KR0\n"); + printk(KERN_INFO "I/O port base = 0x%lx\n", phys_iobase); + } + ia64_iobase = (unsigned long) ioremap(phys_iobase, 0); + + /* setup legacy IO port space */ + io_space[0].mmio_base = ia64_iobase; + io_space[0].sparse = 1; + num_io_spaces = 1; +} + +/** + * early_console_setup - setup debugging console + * + * Consoles started here require little enough setup that we can start using + * them very early in the boot process, either right after the machine + * vector initialization, or even before if the drivers can detect their hw. + * + * Returns non-zero if a console couldn't be set up.
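+ *
+ * As a usage note, setup_arch() below relies on this return convention:
+ *
+ *	if (early_console_setup(*cmdline_p) == 0)
+ *		mark_bsp_online();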
+ */ +static inline int __init +early_console_setup (char *cmdline) +{ +#ifdef CONFIG_SERIAL_SGI_L1_CONSOLE + { + extern int sn_serial_console_early_setup(void); + if (!sn_serial_console_early_setup()) + return 0; + } +#endif +#ifdef CONFIG_EFI_PCDP + if (!efi_setup_pcdp_console(cmdline)) + return 0; +#endif +#ifdef CONFIG_SERIAL_8250_CONSOLE + if (!early_serial_console_init(cmdline)) + return 0; +#endif + + return -1; +} + +static inline void +mark_bsp_online (void) +{ +#ifdef CONFIG_SMP + /* If we register an early console, allow CPU 0 to printk */ + cpu_set(smp_processor_id(), cpu_online_map); +#endif +} + +void __init +setup_arch (char **cmdline_p) +{ + unw_init(); + + ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); + + *cmdline_p = __va(ia64_boot_param->command_line); + strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE); + + efi_init(); + io_port_init(); + +#ifdef CONFIG_IA64_GENERIC + { + const char *mvec_name = strstr (*cmdline_p, "machvec="); + char str[64]; + + if (mvec_name) { + const char *end; + size_t len; + + mvec_name += 8; + end = strchr (mvec_name, ' '); + if (end) + len = end - mvec_name; + else + len = strlen (mvec_name); + len = min(len, sizeof (str) - 1); + strncpy (str, mvec_name, len); + str[len] = '\0'; + mvec_name = str; + } else + mvec_name = acpi_get_sysname(); + machvec_init(mvec_name); + } +#endif + + if (early_console_setup(*cmdline_p) == 0) + mark_bsp_online(); + +#ifdef CONFIG_ACPI_BOOT + /* Initialize the ACPI boot-time table parser */ + acpi_table_init(); +# ifdef CONFIG_ACPI_NUMA + acpi_numa_init(); +# endif +#else +# ifdef CONFIG_SMP + smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */ +# endif +#endif /* CONFIG_ACPI_BOOT */ + + find_memory(); + + /* process SAL system table: */ + ia64_sal_init(efi.sal_systab); + +#ifdef CONFIG_SMP + cpu_physical_id(0) = hard_smp_processor_id(); +#endif + + cpu_init(); /* initialize the bootstrap CPU */ + +#ifdef CONFIG_ACPI_BOOT + acpi_boot_init(); +#endif + +#ifdef CONFIG_VT + if (!conswitchp) { +# if defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +# endif +# if defined(CONFIG_VGA_CONSOLE) + /* + * Non-legacy systems may route legacy VGA MMIO range to system + * memory. vga_con probes the MMIO hole, so memory looks like + * a VGA device to it. The EFI memory map can tell us if it's + * memory so we can avoid this problem. + */ + if (efi_mem_type(0xA0000) != EFI_CONVENTIONAL_MEMORY) + conswitchp = &vga_con; +# endif + } +#endif + + /* enable IA-64 Machine Check Abort Handling unless disabled */ + if (!strstr(saved_command_line, "nomca")) + ia64_mca_init(); + + platform_setup(cmdline_p); + paging_init(); +} + +/* + * Display cpu info for all CPUs.
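+ *
+ * For illustration only (hypothetical values, no particular machine), an
+ * entry produced by the format string in show_cpuinfo() below might read:
+ *
+ *	processor  : 0
+ *	vendor     : GenuineIntel
+ *	arch       : IA-64
+ *	family     : Itanium 2
+ *	model      : 1
+ *	revision   : 5
+ *	archrev    : 0
+ *	features   : branchlong
+ *	cpu number : 0
+ *	cpu regs   : 4
+ *	cpu MHz    : 1300.000000
+ *	itc MHz    : 1300.000000
+ *	BogoMIPS   : 1945.60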
+ */ +static int +show_cpuinfo (struct seq_file *m, void *v) +{ +#ifdef CONFIG_SMP +# define lpj c->loops_per_jiffy +# define cpunum c->cpu +#else +# define lpj loops_per_jiffy +# define cpunum 0 +#endif + static struct { + unsigned long mask; + const char *feature_name; + } feature_bits[] = { + { 1UL << 0, "branchlong" }, + { 1UL << 1, "spontaneous deferral"}, + { 1UL << 2, "16-byte atomic ops" } + }; + char family[32], features[128], *cp, sep; + struct cpuinfo_ia64 *c = v; + unsigned long mask; + int i; + + mask = c->features; + + switch (c->family) { + case 0x07: memcpy(family, "Itanium", 8); break; + case 0x1f: memcpy(family, "Itanium 2", 10); break; + default: sprintf(family, "%u", c->family); break; + } + + /* build the feature string: */ + memcpy(features, " standard", 10); + cp = features; + sep = 0; + for (i = 0; i < (int) ARRAY_SIZE(feature_bits); ++i) { + if (mask & feature_bits[i].mask) { + if (sep) + *cp++ = sep; + sep = ','; + *cp++ = ' '; + strcpy(cp, feature_bits[i].feature_name); + cp += strlen(feature_bits[i].feature_name); + mask &= ~feature_bits[i].mask; + } + } + if (mask) { + /* print unknown features as a hex value: */ + if (sep) + *cp++ = sep; + sprintf(cp, " 0x%lx", mask); + } + + seq_printf(m, + "processor : %d\n" + "vendor : %s\n" + "arch : IA-64\n" + "family : %s\n" + "model : %u\n" + "revision : %u\n" + "archrev : %u\n" + "features :%s\n" /* don't change this---it _is_ right! */ + "cpu number : %lu\n" + "cpu regs : %u\n" + "cpu MHz : %lu.%06lu\n" + "itc MHz : %lu.%06lu\n" + "BogoMIPS : %lu.%02lu\n\n", + cpunum, c->vendor, family, c->model, c->revision, c->archrev, + features, c->ppn, c->number, + c->proc_freq / 1000000, c->proc_freq % 1000000, + c->itc_freq / 1000000, c->itc_freq % 1000000, + lpj*HZ/500000, (lpj*HZ/5000) % 100); + return 0; +} + +static void * +c_start (struct seq_file *m, loff_t *pos) +{ +#ifdef CONFIG_SMP + while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map)) + ++*pos; +#endif + return *pos < NR_CPUS ? 
cpu_data(*pos) : NULL; +} + +static void * +c_next (struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return c_start(m, pos); +} + +static void +c_stop (struct seq_file *m, void *v) +{ +} + +struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo +}; + +void +identify_cpu (struct cpuinfo_ia64 *c) +{ + union { + unsigned long bits[5]; + struct { + /* id 0 & 1: */ + char vendor[16]; + + /* id 2 */ + u64 ppn; /* processor serial number */ + + /* id 3: */ + unsigned number : 8; + unsigned revision : 8; + unsigned model : 8; + unsigned family : 8; + unsigned archrev : 8; + unsigned reserved : 24; + + /* id 4: */ + u64 features; + } field; + } cpuid; + pal_vm_info_1_u_t vm1; + pal_vm_info_2_u_t vm2; + pal_status_t status; + unsigned long impl_va_msb = 50, phys_addr_size = 44; /* Itanium defaults */ + int i; + + for (i = 0; i < 5; ++i) + cpuid.bits[i] = ia64_get_cpuid(i); + + memcpy(c->vendor, cpuid.field.vendor, 16); +#ifdef CONFIG_SMP + c->cpu = smp_processor_id(); +#endif + c->ppn = cpuid.field.ppn; + c->number = cpuid.field.number; + c->revision = cpuid.field.revision; + c->model = cpuid.field.model; + c->family = cpuid.field.family; + c->archrev = cpuid.field.archrev; + c->features = cpuid.field.features; + + status = ia64_pal_vm_summary(&vm1, &vm2); + if (status == PAL_STATUS_SUCCESS) { + impl_va_msb = vm2.pal_vm_info_2_s.impl_va_msb; + phys_addr_size = vm1.pal_vm_info_1_s.phys_add_size; + } + c->unimpl_va_mask = ~((7L<<61) | ((1L << (impl_va_msb + 1)) - 1)); + c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); +} + +void +setup_per_cpu_areas (void) +{ + /* start_kernel() requires this... */ +} + +static void +get_max_cacheline_size (void) +{ + unsigned long line_size, max = 1; + u64 l, levels, unique_caches; + pal_cache_config_info_t cci; + s64 status; + + status = ia64_pal_cache_summary(&levels, &unique_caches); + if (status != 0) { + printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n", + __FUNCTION__, status); + max = SMP_CACHE_BYTES; + goto out; + } + + for (l = 0; l < levels; ++l) { + status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2, + &cci); + if (status != 0) { + printk(KERN_ERR + "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n", + __FUNCTION__, l, status); + max = SMP_CACHE_BYTES; + } + line_size = 1 << cci.pcci_line_size; + if (line_size > max) + max = line_size; + } + out: + if (max > ia64_max_cacheline_size) + ia64_max_cacheline_size = max; +} + +/* + * cpu_init() initializes state that is per-CPU. This function acts + * as a 'CPU state barrier', nothing should get across. + */ +void +cpu_init (void) +{ + extern void __devinit ia64_mmu_init (void *); + unsigned long num_phys_stacked; + pal_vm_info_2_u_t vmi; + unsigned int max_ctx; + struct cpuinfo_ia64 *cpu_info; + void *cpu_data; + + cpu_data = per_cpu_init(); + + /* + * We set ar.k3 so that assembly code in MCA handler can compute + * physical addresses of per cpu variables with a simple: + * phys = ar.k3 + &per_cpu_var + */ + ia64_set_kr(IA64_KR_PER_CPU_DATA, + ia64_tpa(cpu_data) - (long) __per_cpu_start); + + get_max_cacheline_size(); + + /* + * We can't pass "local_cpu_data" to identify_cpu() because we haven't called + * ia64_mmu_init() yet. And we can't call ia64_mmu_init() first because it + * depends on the data returned by identify_cpu(). We break the dependency by + * accessing cpu_data() through the canonical per-CPU address. 
+ */ + cpu_info = cpu_data + ((char *) &__ia64_per_cpu_var(cpu_info) - __per_cpu_start); + identify_cpu(cpu_info); + +#ifdef CONFIG_MCKINLEY + { +# define FEATURE_SET 16 + struct ia64_pal_retval iprv; + + if (cpu_info->family == 0x1f) { + PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, FEATURE_SET, 0); + if ((iprv.status == 0) && (iprv.v0 & 0x80) && (iprv.v2 & 0x80)) + PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES, + (iprv.v1 | 0x80), FEATURE_SET, 0); + } + } +#endif + + /* Clear the stack memory reserved for pt_regs: */ + memset(ia64_task_regs(current), 0, sizeof(struct pt_regs)); + + ia64_set_kr(IA64_KR_FPU_OWNER, 0); + + /* + * Initialize the page-table base register to a global + * directory with all zeroes. This ensures that we can handle + * TLB-misses to user address-space even before we have created the + * first user address-space. This may happen, e.g., due to + * aggressive use of lfetch.fault. + */ + ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page))); + + /* + * Initialize default control register to defer all speculative faults. The + * kernel MUST NOT depend on a particular setting of these bits (in other words, + * the kernel must have recovery code for all speculative accesses). Turn on + * dcr.lc as per recommendation by the architecture team. Most IA-32 apps + * shouldn't be affected by this (moral: keep your ia32 locks aligned and you'll + * be fine). + */ + ia64_setreg(_IA64_REG_CR_DCR, ( IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR + | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + if (current->mm) + BUG(); + + ia64_mmu_init(ia64_imva(cpu_data)); + ia64_mca_cpu_init(ia64_imva(cpu_data)); + +#ifdef CONFIG_IA32_SUPPORT + ia32_cpu_init(); +#endif + + /* Clear ITC to eliminate sched_clock() overflows in human time.
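+ * (A rough worked number: the ITC is a 64-bit cycle counter, so even at a
+ * hypothetical 1 GHz it takes 2^64 / 10^9 seconds, roughly 585 years, to
+ * wrap. Starting it at zero keeps the cycles-to-nanoseconds scaling in
+ * sched_clock() far away from the wrap point for any realistic uptime,
+ * whereas an arbitrary firmware-left value could sit near it.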
*/ + ia64_set_itc(0); + + /* disable all local interrupt sources: */ + ia64_set_itv(1 << 16); + ia64_set_lrr0(1 << 16); + ia64_set_lrr1(1 << 16); + ia64_setreg(_IA64_REG_CR_PMV, 1 << 16); + ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16); + + /* clear TPR & XTP to enable all interrupt classes: */ + ia64_setreg(_IA64_REG_CR_TPR, 0); +#ifdef CONFIG_SMP + normal_xtp(); +#endif + + /* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */ + if (ia64_pal_vm_summary(NULL, &vmi) == 0) + max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1; + else { + printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n"); + max_ctx = (1U << 15) - 1; /* use architected minimum */ + } + while (max_ctx < ia64_ctx.max_ctx) { + unsigned int old = ia64_ctx.max_ctx; + if (cmpxchg(&ia64_ctx.max_ctx, old, max_ctx) == old) + break; + } + + if (ia64_pal_rse_info(&num_phys_stacked, NULL) != 0) { + printk(KERN_WARNING "cpu_init: PAL RSE info failed; assuming 96 physical " + "stacked regs\n"); + num_phys_stacked = 96; + } + /* size of physical stacked register partition plus 8 bytes: */ + __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8; + platform_cpu_init(); +} + +void +check_bugs (void) +{ + ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles, + (unsigned long) __end___mckinley_e9_bundles); +} diff --git a/arch/ia64/kernel/sigframe.h b/arch/ia64/kernel/sigframe.h new file mode 100644 index 000000000000..37b986cb86e0 --- /dev/null +++ b/arch/ia64/kernel/sigframe.h @@ -0,0 +1,25 @@ +struct sigscratch { + unsigned long scratch_unat; /* ar.unat for the general registers saved in pt */ + unsigned long ar_pfs; /* for syscalls, the user-level function-state */ + struct pt_regs pt; +}; + +struct sigframe { + /* + * Place signal handler args where user-level unwinder can find them easily. + * DO NOT MOVE THESE. They are part of the IA-64 Linux ABI and there is + * user-level code that depends on their presence! + */ + unsigned long arg0; /* signum */ + unsigned long arg1; /* siginfo pointer */ + unsigned long arg2; /* sigcontext pointer */ + /* + * End of architected state. + */ + + void __user *handler; /* pointer to the plabel of the signal handler */ + struct siginfo info; + struct sigcontext sc; +}; + +extern long ia64_do_signal (sigset_t *, struct sigscratch *, long); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c new file mode 100644 index 000000000000..6891d86937d9 --- /dev/null +++ b/arch/ia64/kernel/signal.c @@ -0,0 +1,691 @@ +/* + * Architecture-specific signal handling support. + * + * Copyright (C) 1999-2004 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * Derived from i386 and Alpha versions. 
+ */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <linux/signal.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/tty.h> +#include <linux/binfmts.h> +#include <linux/unistd.h> +#include <linux/wait.h> + +#include <asm/ia32.h> +#include <asm/intrinsics.h> +#include <asm/uaccess.h> +#include <asm/rse.h> +#include <asm/sigcontext.h> + +#include "sigframe.h" + +#define DEBUG_SIG 0 +#define STACK_ALIGN 16 /* minimal alignment for stack pointer */ +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +#if _NSIG_WORDS > 1 +# define PUT_SIGSET(k,u) __copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t)) +# define GET_SIGSET(k,u) __copy_from_user((k)->sig, (u)->sig, sizeof(sigset_t)) +#else +# define PUT_SIGSET(k,u) __put_user((k)->sig[0], &(u)->sig[0]) +# define GET_SIGSET(k,u) __get_user((k)->sig[0], &(u)->sig[0]) +#endif + +long +ia64_rt_sigsuspend (sigset_t __user *uset, size_t sigsetsize, struct sigscratch *scr) +{ + sigset_t oldset, set; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (!access_ok(VERIFY_READ, uset, sigsetsize)) + return -EFAULT; + + if (GET_SIGSET(&set, uset)) + return -EFAULT; + + sigdelsetmask(&set, ~_BLOCKABLE); + + spin_lock_irq(&current->sighand->siglock); + { + oldset = current->blocked; + current->blocked = set; + recalc_sigpending(); + } + spin_unlock_irq(&current->sighand->siglock); + + /* + * The return below usually returns to the signal handler. We need to + * pre-set the correct error code here to ensure that the right values + * get saved in sigcontext by ia64_do_signal. + */ + scr->pt.r8 = EINTR; + scr->pt.r10 = -1; + + while (1) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + if (ia64_do_signal(&oldset, scr, 1)) + return -EINTR; + } +} + +asmlinkage long +sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2, + long arg3, long arg4, long arg5, long arg6, long arg7, + struct pt_regs regs) +{ + return do_sigaltstack(uss, uoss, regs.r12); +} + +static long +restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) +{ + unsigned long ip, flags, nat, um, cfm; + long err; + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + + /* restore the scratch state that always gets updated during signal delivery: */ + err = __get_user(flags, &sc->sc_flags); + err |= __get_user(nat, &sc->sc_nat); + err |= __get_user(ip, &sc->sc_ip); /* instruction pointer */ + err |= __get_user(cfm, &sc->sc_cfm); + err |= __get_user(um, &sc->sc_um); /* user mask */ + err |= __get_user(scr->pt.ar_rsc, &sc->sc_ar_rsc); + err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat); + err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); + err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs); + err |= __get_user(scr->pt.pr, &sc->sc_pr); /* predicates */ + err |= __get_user(scr->pt.b0, &sc->sc_br[0]); /* b0 (rp) */ + err |= __get_user(scr->pt.b6, &sc->sc_br[6]); /* b6 */ + err |= __copy_from_user(&scr->pt.r1, &sc->sc_gr[1], 8); /* r1 */ + err |= __copy_from_user(&scr->pt.r8, &sc->sc_gr[8], 4*8); /* r8-r11 */ + err |= __copy_from_user(&scr->pt.r12, &sc->sc_gr[12], 2*8); /* r12-r13 */ + err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8); /* r15 */ + + scr->pt.cr_ifs = cfm | (1UL << 63); + + /* establish new instruction pointer: */ +
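+	/*
+	 * (sc_ip encodes both the bundle address and the instruction slot:
+	 * IA-64 bundles are 16 bytes, so their addresses are 16-byte aligned,
+	 * and psr.ri selects slot 0-2 within a bundle; the two low bits of
+	 * sc_ip therefore carry the slot. E.g. sc_ip == bundle | 2 resumes
+	 * at slot 2 of that bundle.)
+	 */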
scr->pt.cr_iip = ip & ~0x3UL; + ia64_psr(&scr->pt)->ri = ip & 0x3; + scr->pt.cr_ipsr = (scr->pt.cr_ipsr & ~IA64_PSR_UM) | (um & IA64_PSR_UM); + + scr->scratch_unat = ia64_put_scratch_nat_bits(&scr->pt, nat); + + if (!(flags & IA64_SC_FLAG_IN_SYSCALL)) { + /* Restore most scratch-state only when not in syscall. */ + err |= __get_user(scr->pt.ar_ccv, &sc->sc_ar_ccv); /* ar.ccv */ + err |= __get_user(scr->pt.b7, &sc->sc_br[7]); /* b7 */ + err |= __get_user(scr->pt.r14, &sc->sc_gr[14]); /* r14 */ + err |= __copy_from_user(&scr->pt.ar_csd, &sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */ + err |= __copy_from_user(&scr->pt.r2, &sc->sc_gr[2], 2*8); /* r2-r3 */ + err |= __copy_from_user(&scr->pt.r16, &sc->sc_gr[16], 16*8); /* r16-r31 */ + } + + if ((flags & IA64_SC_FLAG_FPH_VALID) != 0) { + struct ia64_psr *psr = ia64_psr(&scr->pt); + + __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16); + psr->mfh = 0; /* drop signal handler's fph contents... */ + if (psr->dfh) + ia64_drop_fpu(current); + else { + /* We already own the local fph, otherwise psr->dfh wouldn't be 0. */ + __ia64_load_fpu(current->thread.fph); + ia64_set_local_fpu_owner(current); + } + } + return err; +} + +int +copy_siginfo_to_user (siginfo_t __user *to, siginfo_t *from) +{ + if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t))) + return -EFAULT; + if (from->si_code < 0) { + if (__copy_to_user(to, from, sizeof(siginfo_t))) + return -EFAULT; + return 0; + } else { + int err; + + /* + * If you change siginfo_t structure, please be sure this code is fixed + * accordingly. It should never copy any pad contained in the structure + * to avoid security leaks, but must copy the generic 3 ints plus the + * relevant union member. + */ + err = __put_user(from->si_signo, &to->si_signo); + err |= __put_user(from->si_errno, &to->si_errno); + err |= __put_user((short)from->si_code, &to->si_code); + switch (from->si_code >> 16) { + case __SI_FAULT >> 16: + err |= __put_user(from->si_flags, &to->si_flags); + err |= __put_user(from->si_isr, &to->si_isr); + case __SI_POLL >> 16: + err |= __put_user(from->si_addr, &to->si_addr); + err |= __put_user(from->si_imm, &to->si_imm); + break; + case __SI_TIMER >> 16: + err |= __put_user(from->si_tid, &to->si_tid); + err |= __put_user(from->si_overrun, &to->si_overrun); + err |= __put_user(from->si_ptr, &to->si_ptr); + break; + case __SI_RT >> 16: /* Not generated by the kernel as of now. */ + case __SI_MESGQ >> 16: + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_ptr, &to->si_ptr); + break; + case __SI_CHLD >> 16: + err |= __put_user(from->si_utime, &to->si_utime); + err |= __put_user(from->si_stime, &to->si_stime); + err |= __put_user(from->si_status, &to->si_status); + default: + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_pid, &to->si_pid); + break; + } + return err; + } +} + +long +ia64_rt_sigreturn (struct sigscratch *scr) +{ + extern char ia64_strace_leave_kernel, ia64_leave_kernel; + struct sigcontext __user *sc; + struct siginfo si; + sigset_t set; + long retval; + + sc = &((struct sigframe __user *) (scr->pt.r12 + 16))->sc; + + /* + * When we return to the previously executing context, r8 and r10 have already + * been setup the way we want them. Indeed, if the signal wasn't delivered while + * in a system call, we must not touch r8 or r10 as otherwise user-level state + * could be corrupted. 
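+ *
+ * (For reference, the ia64 syscall return convention at play here: r8
+ * carries the return value and r10 flags an error -- 0 on success, -1 on
+ * failure. E.g. a read() that returned 42 bytes left r8 == 42 and
+ * r10 == 0; overwriting either register here would corrupt that state.)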
+ */ + retval = (long) &ia64_leave_kernel; + if (test_thread_flag(TIF_SYSCALL_TRACE)) + /* + * strace expects to be notified after sigreturn returns even though the + * context to which we return may not be in the middle of a syscall. + * Thus, the return-value that strace displays for sigreturn is + * meaningless. + */ + retval = (long) &ia64_strace_leave_kernel; + + if (!access_ok(VERIFY_READ, sc, sizeof(*sc))) + goto give_sigsegv; + + if (GET_SIGSET(&set, &sc->sc_mask)) + goto give_sigsegv; + + sigdelsetmask(&set, ~_BLOCKABLE); + + spin_lock_irq(&current->sighand->siglock); + { + current->blocked = set; + recalc_sigpending(); + } + spin_unlock_irq(&current->sighand->siglock); + + if (restore_sigcontext(sc, scr)) + goto give_sigsegv; + +#if DEBUG_SIG + printk("SIG return (%s:%d): sp=%lx ip=%lx\n", + current->comm, current->pid, scr->pt.r12, scr->pt.cr_iip); +#endif + /* + * It is more difficult to avoid calling this function than to + * call it and ignore errors. + */ + do_sigaltstack(&sc->sc_stack, NULL, scr->pt.r12); + return retval; + + give_sigsegv: + si.si_signo = SIGSEGV; + si.si_errno = 0; + si.si_code = SI_KERNEL; + si.si_pid = current->pid; + si.si_uid = current->uid; + si.si_addr = sc; + force_sig_info(SIGSEGV, &si, current); + return retval; +} + +/* + * This does just the minimum required setup of sigcontext. + * Specifically, it only installs data that is either not knowable at + * the user-level or that gets modified before execution in the + * trampoline starts. Everything else is done at the user-level. + */ +static long +setup_sigcontext (struct sigcontext __user *sc, sigset_t *mask, struct sigscratch *scr) +{ + unsigned long flags = 0, ifs, cfm, nat; + long err; + + ifs = scr->pt.cr_ifs; + + if (on_sig_stack((unsigned long) sc)) + flags |= IA64_SC_FLAG_ONSTACK; + if ((ifs & (1UL << 63)) == 0) + /* if cr_ifs doesn't have the valid bit set, we got here through a syscall */ + flags |= IA64_SC_FLAG_IN_SYSCALL; + cfm = ifs & ((1UL << 38) - 1); + ia64_flush_fph(current); + if ((current->thread.flags & IA64_THREAD_FPH_VALID)) { + flags |= IA64_SC_FLAG_FPH_VALID; + __copy_to_user(&sc->sc_fr[32], current->thread.fph, 96*16); + } + + nat = ia64_get_scratch_nat_bits(&scr->pt, scr->scratch_unat); + + err = __put_user(flags, &sc->sc_flags); + err |= __put_user(nat, &sc->sc_nat); + err |= PUT_SIGSET(mask, &sc->sc_mask); + err |= __put_user(cfm, &sc->sc_cfm); + err |= __put_user(scr->pt.cr_ipsr & IA64_PSR_UM, &sc->sc_um); + err |= __put_user(scr->pt.ar_rsc, &sc->sc_ar_rsc); + err |= __put_user(scr->pt.ar_unat, &sc->sc_ar_unat); /* ar.unat */ + err |= __put_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); /* ar.fpsr */ + err |= __put_user(scr->pt.ar_pfs, &sc->sc_ar_pfs); + err |= __put_user(scr->pt.pr, &sc->sc_pr); /* predicates */ + err |= __put_user(scr->pt.b0, &sc->sc_br[0]); /* b0 (rp) */ + err |= __put_user(scr->pt.b6, &sc->sc_br[6]); /* b6 */ + err |= __copy_to_user(&sc->sc_gr[1], &scr->pt.r1, 8); /* r1 */ + err |= __copy_to_user(&sc->sc_gr[8], &scr->pt.r8, 4*8); /* r8-r11 */ + err |= __copy_to_user(&sc->sc_gr[12], &scr->pt.r12, 2*8); /* r12-r13 */ + err |= __copy_to_user(&sc->sc_gr[15], &scr->pt.r15, 8); /* r15 */ + err |= __put_user(scr->pt.cr_iip + ia64_psr(&scr->pt)->ri, &sc->sc_ip); + + if (flags & IA64_SC_FLAG_IN_SYSCALL) { + /* Clear scratch registers if the signal interrupted a system call.
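+ * (The scratch registers were not preserved across the syscall, so at this
+ * point they hold stale kernel-side values; zeroing them, rather than
+ * copying them out, avoids leaking kernel data into the user-visible
+ * sigcontext.)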
*/ + err |= __put_user(0, &sc->sc_ar_ccv); /* ar.ccv */ + err |= __put_user(0, &sc->sc_br[7]); /* b7 */ + err |= __put_user(0, &sc->sc_gr[14]); /* r14 */ + err |= __clear_user(&sc->sc_ar25, 2*8); /* ar.csd & ar.ssd */ + err |= __clear_user(&sc->sc_gr[2], 2*8); /* r2-r3 */ + err |= __clear_user(&sc->sc_gr[16], 16*8); /* r16-r31 */ + } else { + /* Copy scratch regs to sigcontext if the signal didn't interrupt a syscall. */ + err |= __put_user(scr->pt.ar_ccv, &sc->sc_ar_ccv); /* ar.ccv */ + err |= __put_user(scr->pt.b7, &sc->sc_br[7]); /* b7 */ + err |= __put_user(scr->pt.r14, &sc->sc_gr[14]); /* r14 */ + err |= __copy_to_user(&sc->sc_ar25, &scr->pt.ar_csd, 2*8); /* ar.csd & ar.ssd */ + err |= __copy_to_user(&sc->sc_gr[2], &scr->pt.r2, 2*8); /* r2-r3 */ + err |= __copy_to_user(&sc->sc_gr[16], &scr->pt.r16, 16*8); /* r16-r31 */ + } + return err; +} + +/* + * Check whether the register-backing store is already on the signal stack. + */ +static inline int +rbs_on_sig_stack (unsigned long bsp) +{ + return (bsp - current->sas_ss_sp < current->sas_ss_size); +} + +static long +force_sigsegv_info (int sig, void __user *addr) +{ + unsigned long flags; + struct siginfo si; + + if (sig == SIGSEGV) { + /* + * Acquiring siglock around the sa_handler-update is almost + * certainly overkill, but this isn't a + * performance-critical path and I'd rather play it safe + * here than having to debug a nasty race if and when + * something changes in kernel/signal.c that would make it + * no longer safe to modify sa_handler without holding the + * lock. + */ + spin_lock_irqsave(&current->sighand->siglock, flags); + current->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; + spin_unlock_irqrestore(&current->sighand->siglock, flags); + } + si.si_signo = SIGSEGV; + si.si_errno = 0; + si.si_code = SI_KERNEL; + si.si_pid = current->pid; + si.si_uid = current->uid; + si.si_addr = addr; + force_sig_info(SIGSEGV, &si, current); + return 0; +} + +static long +setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, + struct sigscratch *scr) +{ + extern char __kernel_sigtramp[]; + unsigned long tramp_addr, new_rbs = 0; + struct sigframe __user *frame; + long err; + + frame = (void __user *) scr->pt.r12; + tramp_addr = (unsigned long) __kernel_sigtramp; + if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) { + frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size) + & ~(STACK_ALIGN - 1)); + /* + * We need to check for the register stack being on the signal stack + * separately, because it's switched separately (memory stack is switched + * in the kernel, register stack is switched in the signal trampoline).
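+ *
+ * (A worked example of the rbs_on_sig_stack() check above, with made-up
+ * numbers: given sas_ss_sp == 0x1000 and sas_ss_size == 0x1000, a backing
+ * store pointer of 0x1800 gives 0x1800 - 0x1000 == 0x800 < 0x1000, i.e.
+ * already on the signal stack; a bsp of 0x800 underflows to a huge
+ * unsigned value, i.e. off the stack, so new_rbs gets set.)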
+ */ + if (!rbs_on_sig_stack(scr->pt.ar_bspstore)) + new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1); + } + frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1)); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return force_sigsegv_info(sig, frame); + + err = __put_user(sig, &frame->arg0); + err |= __put_user(&frame->info, &frame->arg1); + err |= __put_user(&frame->sc, &frame->arg2); + err |= __put_user(new_rbs, &frame->sc.sc_rbs_base); + err |= __put_user(0, &frame->sc.sc_loadrs); /* initialize to zero */ + err |= __put_user(ka->sa.sa_handler, &frame->handler); + + err |= copy_siginfo_to_user(&frame->info, info); + + err |= __put_user(current->sas_ss_sp, &frame->sc.sc_stack.ss_sp); + err |= __put_user(current->sas_ss_size, &frame->sc.sc_stack.ss_size); + err |= __put_user(sas_ss_flags(scr->pt.r12), &frame->sc.sc_stack.ss_flags); + err |= setup_sigcontext(&frame->sc, set, scr); + + if (unlikely(err)) + return force_sigsegv_info(sig, frame); + + scr->pt.r12 = (unsigned long) frame - 16; /* new stack pointer */ + scr->pt.ar_fpsr = FPSR_DEFAULT; /* reset fpsr for signal handler */ + scr->pt.cr_iip = tramp_addr; + ia64_psr(&scr->pt)->ri = 0; /* start executing in first slot */ + ia64_psr(&scr->pt)->be = 0; /* force little-endian byte-order */ + /* + * Force the interruption function mask to zero. This has no effect when a + * system-call got interrupted by a signal (since, in that case, scr->pt_cr_ifs is + * ignored), but it has the desirable effect of making it possible to deliver a + * signal with an incomplete register frame (which happens when a mandatory RSE + * load faults). Furthermore, it has no negative effect on getting the user's + * dirty partition preserved, because that's governed by scr->pt.loadrs. + */ + scr->pt.cr_ifs = (1UL << 63); + + /* + * Note: this affects only the NaT bits of the scratch regs (the ones saved in + * pt_regs), which is exactly what we want. + */ + scr->scratch_unat = 0; /* ensure the NaT bits of r12 are clear */ + +#if DEBUG_SIG + printk("SIG deliver (%s:%d): sig=%d sp=%lx ip=%lx handler=%p\n", + current->comm, current->pid, sig, scr->pt.r12, frame->sc.sc_ip, frame->handler); +#endif + return 1; +} + +static long +handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, + struct sigscratch *scr) +{ + if (IS_IA32_PROCESS(&scr->pt)) { + /* send signal to IA-32 process */ + if (!ia32_setup_frame1(sig, ka, info, oldset, &scr->pt)) + return 0; + } else + /* send signal to IA-64 process */ + if (!setup_frame(sig, ka, info, oldset, scr)) + return 0; + + if (!(ka->sa.sa_flags & SA_NODEFER)) { + spin_lock_irq(&current->sighand->siglock); + { + sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); + sigaddset(&current->blocked, sig); + recalc_sigpending(); + } + spin_unlock_irq(&current->sighand->siglock); + } + return 1; +} + +/* + * Note that `init' is a special process: it doesn't get signals it doesn't want to + * handle. Thus you cannot kill init even with a SIGKILL even by mistake. + */ +long +ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) +{ + struct k_sigaction ka; + siginfo_t info; + long restart = in_syscall; + long errno = scr->pt.r8; +# define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) + + /* + * In the ia64_leave_kernel code path, we want the common case to go fast, which + * is why we may in certain cases get here from kernel mode. Just return without + * doing anything if so.
+ */ + if (!user_mode(&scr->pt)) + return 0; + + if (!oldset) + oldset = &current->blocked; + + /* + * This only loops in the rare cases of handle_signal() failing, in which case we + * need to push through a forced SIGSEGV. + */ + while (1) { + int signr = get_signal_to_deliver(&info, &ka, &scr->pt, NULL); + + /* + * get_signal_to_deliver() may have run a debugger (via notify_parent()) + * and the debugger may have modified the state (e.g., to arrange for an + * inferior call), thus it's important to check for restarting _after_ + * get_signal_to_deliver(). + */ + if (IS_IA32_PROCESS(&scr->pt)) { + if (in_syscall) { + if (errno >= 0) + restart = 0; + else + errno = -errno; + } + } else if ((long) scr->pt.r10 != -1) + /* + * A system call has to be restarted only if one of the error codes + * ERESTARTNOHAND, ERESTARTSYS, or ERESTARTNOINTR is returned. If r10 + * isn't -1 then r8 doesn't hold an error code and we don't need to + * restart the syscall, so we can clear the "restart" flag here. + */ + restart = 0; + + if (signr <= 0) + break; + + if (unlikely(restart)) { + switch (errno) { + case ERESTART_RESTARTBLOCK: + case ERESTARTNOHAND: + scr->pt.r8 = ERR_CODE(EINTR); + /* note: scr->pt.r10 is already -1 */ + break; + + case ERESTARTSYS: + if ((ka.sa.sa_flags & SA_RESTART) == 0) { + scr->pt.r8 = ERR_CODE(EINTR); + /* note: scr->pt.r10 is already -1 */ + break; + } + case ERESTARTNOINTR: + if (IS_IA32_PROCESS(&scr->pt)) { + scr->pt.r8 = scr->pt.r1; + scr->pt.cr_iip -= 2; + } else + ia64_decrement_ip(&scr->pt); + restart = 0; /* don't restart twice if handle_signal() fails... */ + } + } + + /* + * Whee! Actually deliver the signal. If the delivery failed, we need to + * continue to iterate in this loop so we can deliver the SIGSEGV... + */ + if (handle_signal(signr, &ka, &info, oldset, scr)) + return 1; + } + + /* Did we come from a system call? */ + if (restart) { + /* Restart the system call - no handlers present */ + if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR + || errno == ERESTART_RESTARTBLOCK) + { + if (IS_IA32_PROCESS(&scr->pt)) { + scr->pt.r8 = scr->pt.r1; + scr->pt.cr_iip -= 2; + if (errno == ERESTART_RESTARTBLOCK) + scr->pt.r8 = 0; /* x86 version of __NR_restart_syscall */ + } else { + /* + * Note: the syscall number is in r15 which is saved in + * pt_regs so all we need to do here is adjust ip so that + * the "break" instruction gets re-executed. + */ + ia64_decrement_ip(&scr->pt); + if (errno == ERESTART_RESTARTBLOCK) + scr->pt.r15 = __NR_restart_syscall; + } + } + } + return 0; +} + +/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it + * could not be delivered. It is important that the target process is not + * allowed to do any more work in user space. Possible cases for the target + * process: + * + * - It is sleeping and will wake up soon. Store the data in the current task, + * the signal will be sent when the current task returns from the next + * interrupt. + * + * - It is running in user context. Store the data in the current task, the + * signal will be sent when the current task returns from the next interrupt. + * + * - It is running in kernel context on this or another cpu and will return to + * user context. Store the data in the target task, the signal will be sent + * to itself when the target task returns to user space. + * + * - It is running in kernel context on this cpu and will sleep before + * returning to user context.
Because this is also the current task, the + * signal will not get delivered and the task could sleep indefinitely. + * Store the data in the idle task for this cpu, the signal will be sent + * after the idle task processes its next interrupt. + * + * To cover all cases, store the data in the target task, the current task and + * the idle task on this cpu. Whatever happens, the signal will be delivered + * to the target task before it can do any useful user space work. Multiple + * deliveries have no unwanted side effects. + * + * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts + * disabled. It must not take any locks nor use kernel structures or services + * that require locks. + */ + +/* To ensure that we get the right pid, check its start time. To avoid extra + * include files in thread_info.h, convert the task start_time to unsigned long, + * giving us a cycle time of > 580 years. + */ +static inline unsigned long +start_time_ul(const struct task_struct *t) +{ + return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec; +} + +void +set_sigdelayed(pid_t pid, int signo, int code, void __user *addr) +{ + struct task_struct *t; + unsigned long start_time = 0; + int i; + + for (i = 1; i <= 3; ++i) { + switch (i) { + case 1: + t = find_task_by_pid(pid); + if (t) + start_time = start_time_ul(t); + break; + case 2: + t = current; + break; + default: + t = idle_task(smp_processor_id()); + break; + } + + if (!t) + return; + t->thread_info->sigdelayed.signo = signo; + t->thread_info->sigdelayed.code = code; + t->thread_info->sigdelayed.addr = addr; + t->thread_info->sigdelayed.start_time = start_time; + t->thread_info->sigdelayed.pid = pid; + wmb(); + set_tsk_thread_flag(t, TIF_SIGDELAYED); + } +} + +/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that + * was detected in MCA/INIT/NMI/PMI context where it could not be delivered. + */ + +void +do_sigdelayed(void) +{ + struct siginfo siginfo; + pid_t pid; + struct task_struct *t; + + clear_thread_flag(TIF_SIGDELAYED); + memset(&siginfo, 0, sizeof(siginfo)); + siginfo.si_signo = current_thread_info()->sigdelayed.signo; + siginfo.si_code = current_thread_info()->sigdelayed.code; + siginfo.si_addr = current_thread_info()->sigdelayed.addr; + pid = current_thread_info()->sigdelayed.pid; + t = find_task_by_pid(pid); + if (!t) + return; + if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) + return; + force_sig_info(siginfo.si_signo, &siginfo, t); +} diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c new file mode 100644 index 000000000000..953095e2ce15 --- /dev/null +++ b/arch/ia64/kernel/smp.c @@ -0,0 +1,376 @@ +/* + * SMP Support + * + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999, 2001, 2003 David Mosberger-Tang <davidm@hpl.hp.com> + * + * Lots of stuff stolen from arch/alpha/kernel/smp.c + * + * 01/05/16 Rohit Seth <rohit.seth@intel.com> IA64-SMP functions. Reorganized + * the existing code (on the lines of x86 port). + * 00/09/11 David Mosberger <davidm@hpl.hp.com> Do loops_per_jiffy + * calibration on each CPU. + * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> fixed logical processor id + * 00/03/31 Rohit Seth <rohit.seth@intel.com> Fixes for Bootstrap Processor + * & cpu_online_map now gets done here (instead of setup.c) + * 99/10/05 davidm Update to bring it in sync with new command-line processing + * scheme. 
+ * 10/13/00 Goutham Rao <goutham.rao@intel.com> Updated smp_call_function and + * smp_call_function_single to resend IPI on timeouts + */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/smp.h> +#include <linux/kernel_stat.h> +#include <linux/mm.h> +#include <linux/cache.h> +#include <linux/delay.h> +#include <linux/efi.h> +#include <linux/bitops.h> + +#include <asm/atomic.h> +#include <asm/current.h> +#include <asm/delay.h> +#include <asm/machvec.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/sal.h> +#include <asm/system.h> +#include <asm/tlbflush.h> +#include <asm/unistd.h> +#include <asm/mca.h> + +/* + * Structure and data for smp_call_function(). This is designed to minimise static memory + * requirements. It also looks cleaner. + */ +static __cacheline_aligned DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + long wait; + atomic_t started; + atomic_t finished; +}; + +static volatile struct call_data_struct *call_data; + +#define IPI_CALL_FUNC 0 +#define IPI_CPU_STOP 1 + +/* This needs to be cacheline aligned because it is written to by *other* CPUs. */ +static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; + +extern void cpu_halt (void); + +void +lock_ipi_calllock(void) +{ + spin_lock_irq(&call_lock); +} + +void +unlock_ipi_calllock(void) +{ + spin_unlock_irq(&call_lock); +} + +static void +stop_this_cpu (void) +{ + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + max_xtp(); + local_irq_disable(); + cpu_halt(); +} + +void +cpu_die(void) +{ + max_xtp(); + local_irq_disable(); + cpu_halt(); + /* Should never be here */ + BUG(); + for (;;); +} + +irqreturn_t +handle_IPI (int irq, void *dev_id, struct pt_regs *regs) +{ + int this_cpu = get_cpu(); + unsigned long *pending_ipis = &__ia64_per_cpu_var(ipi_operation); + unsigned long ops; + + mb(); /* Order interrupt and bit testing. */ + while ((ops = xchg(pending_ipis, 0)) != 0) { + mb(); /* Order bit clearing and data access. */ + do { + unsigned long which; + + which = ffz(~ops); + ops &= ~(1 << which); + + switch (which) { + case IPI_CALL_FUNC: + { + struct call_data_struct *data; + void (*func)(void *info); + void *info; + int wait; + + /* release the 'pointer lock' */ + data = (struct call_data_struct *) call_data; + func = data->func; + info = data->info; + wait = data->wait; + + mb(); + atomic_inc(&data->started); + /* + * At this point the structure may be gone unless + * wait is true. + */ + (*func)(info); + + /* Notify the sending CPU that the task is done. */ + mb(); + if (wait) + atomic_inc(&data->finished); + } + break; + + case IPI_CPU_STOP: + stop_this_cpu(); + break; + + default: + printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which); + break; + } + } while (ops); + mb(); /* Order data access and bit testing. */ + } + put_cpu(); + return IRQ_HANDLED; +} + +/* + * Called with preemption disabled. + */ +static inline void +send_IPI_single (int dest_cpu, int op) +{ + set_bit(op, &per_cpu(ipi_operation, dest_cpu)); + platform_send_ipi(dest_cpu, IA64_IPI_VECTOR, IA64_IPI_DM_INT, 0); +} + +/* + * Called with preemption disabled.
+ */ +static inline void +send_IPI_allbutself (int op) +{ + unsigned int i; + + for (i = 0; i < NR_CPUS; i++) { + if (cpu_online(i) && i != smp_processor_id()) + send_IPI_single(i, op); + } +} + +/* + * Called with preemption disabled. + */ +static inline void +send_IPI_all (int op) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i)) + send_IPI_single(i, op); +} + +/* + * Called with preemption disabled. + */ +static inline void +send_IPI_self (int op) +{ + send_IPI_single(smp_processor_id(), op); +} + +/* + * Called with preemption disabled. + */ +void +smp_send_reschedule (int cpu) +{ + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); +} + +void +smp_flush_tlb_all (void) +{ + on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1); +} + +void +smp_flush_tlb_mm (struct mm_struct *mm) +{ + /* this happens for the common case of a single-threaded fork(): */ + if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1)) + { + local_finish_flush_tlb_mm(mm); + return; + } + + /* + * We could optimize this further by using mm->cpu_vm_mask to track which CPUs + * have been running in the address space. It's not clear that this is worth the + * trouble though: to avoid races, we have to raise the IPI on the target CPU + * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is + * rather trivial. + */ + on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1); +} + +/* + * Run a function on another CPU + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <nonatomic> Currently unused. + * <wait> If true, wait until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func>, + * is executing it, or has already executed it. + */ + +int +smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + struct call_data_struct data; + int cpus = 1; + int me = get_cpu(); /* prevent preemption and reschedule on another processor */ + + if (cpuid == me) { + printk("%s: trying to call self\n", __FUNCTION__); + put_cpu(); + return -EBUSY; + } + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + spin_lock_bh(&call_lock); + + call_data = &data; + mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ + send_IPI_single(cpuid, IPI_CALL_FUNC); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + call_data = NULL; + + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); + +/* + * This function sends a 'generic call function' IPI to all other CPUs + * in the system. + */ + +/* + * [SUMMARY] Run a function on all other CPUs. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <nonatomic> currently unused. + * <wait> If true, wait (atomically) until function has completed on other CPUs. + * [RETURNS] 0 on success, else a negative status code. + * + * Does not return until remote CPUs are nearly ready to execute <func>, are + * executing it, or have already executed it. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler.
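+ *
+ * A sketch of typical use (the handler name is made up for illustration;
+ * the handler must follow the constraints above):
+ *
+ *	static void drain_local_counters (void *info)
+ *	{
+ *		... fast, non-blocking, per-CPU work ...
+ *	}
+ *
+ *	if (smp_call_function(drain_local_counters, NULL, 1, 1) == 0)
+ *		drain_local_counters(NULL);	(the calling CPU is not included)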
+ */ +int +smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = num_online_cpus()-1; + + if (!cpus) + return 0; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + spin_lock(&call_lock); + + call_data = &data; + mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC */ + send_IPI_allbutself(IPI_CALL_FUNC); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + call_data = NULL; + + spin_unlock(&call_lock); + return 0; +} +EXPORT_SYMBOL(smp_call_function); + +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ +void +smp_send_stop (void) +{ + send_IPI_allbutself(IPI_CPU_STOP); +} + +int __init +setup_profiling_timer (unsigned int multiplier) +{ + return -EINVAL; +} diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c new file mode 100644 index 000000000000..5318f0cbfc26 --- /dev/null +++ b/arch/ia64/kernel/smpboot.c @@ -0,0 +1,692 @@ +/* + * SMP boot-related support + * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * 01/05/16 Rohit Seth <rohit.seth@intel.com> Moved SMP booting functions from smp.c to here. + * 01/04/27 David Mosberger <davidm@hpl.hp.com> Added ITC synching code. + * 02/07/31 David Mosberger <davidm@hpl.hp.com> Switch over to hotplug-CPU boot-sequence. + * smp_boot_cpus()/smp_commence() is replaced by + * smp_prepare_cpus()/__cpu_up()/smp_cpus_done(). + */ +#include <linux/config.h> + +#include <linux/module.h> +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/spinlock.h> +#include <linux/efi.h> +#include <linux/percpu.h> +#include <linux/bitops.h> + +#include <asm/atomic.h> +#include <asm/cache.h> +#include <asm/current.h> +#include <asm/delay.h> +#include <asm/ia32.h> +#include <asm/io.h> +#include <asm/irq.h> +#include <asm/machvec.h> +#include <asm/mca.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/sal.h> +#include <asm/system.h> +#include <asm/tlbflush.h> +#include <asm/unistd.h> + +#define SMP_DEBUG 0 + +#if SMP_DEBUG +#define Dprintk(x...) printk(x) +#else +#define Dprintk(x...) 
+#endif + + +/* + * ITC synchronization related stuff: + */ +#define MASTER 0 +#define SLAVE (SMP_CACHE_BYTES/8) + +#define NUM_ROUNDS 64 /* magic value */ +#define NUM_ITERS 5 /* likewise */ + +static DEFINE_SPINLOCK(itc_sync_lock); +static volatile unsigned long go[SLAVE + 1]; + +#define DEBUG_ITC_SYNC 0 + +extern void __devinit calibrate_delay (void); +extern void start_ap (void); +extern unsigned long ia64_iobase; + +task_t *task_for_booting_cpu; + +/* + * State for each CPU + */ +DEFINE_PER_CPU(int, cpu_state); + +/* Bitmasks of currently online, and possible CPUs */ +cpumask_t cpu_online_map; +EXPORT_SYMBOL(cpu_online_map); +cpumask_t cpu_possible_map; +EXPORT_SYMBOL(cpu_possible_map); + +/* which logical CPU number maps to which CPU (physical APIC ID) */ +volatile int ia64_cpu_to_sapicid[NR_CPUS]; +EXPORT_SYMBOL(ia64_cpu_to_sapicid); + +static volatile cpumask_t cpu_callin_map; + +struct smp_boot_data smp_boot_data __initdata; + +unsigned long ap_wakeup_vector = -1; /* External Int use to wakeup APs */ + +char __initdata no_int_routing; + +unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */ + +static int __init +nointroute (char *str) +{ + no_int_routing = 1; + printk ("no_int_routing on\n"); + return 1; +} + +__setup("nointroute", nointroute); + +void +sync_master (void *arg) +{ + unsigned long flags, i; + + go[MASTER] = 0; + + local_irq_save(flags); + { + for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { + while (!go[MASTER]); + go[MASTER] = 0; + go[SLAVE] = ia64_get_itc(); + } + } + local_irq_restore(flags); +} + +/* + * Return the number of cycles by which our itc differs from the itc on the master + * (time-keeper) CPU. A positive number indicates our itc is ahead of the master, + * negative that it is behind. + */ +static inline long +get_delta (long *rt, long *master) +{ + unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; + unsigned long tcenter, t0, t1, tm; + long i; + + for (i = 0; i < NUM_ITERS; ++i) { + t0 = ia64_get_itc(); + go[MASTER] = 1; + while (!(tm = go[SLAVE])); + go[SLAVE] = 0; + t1 = ia64_get_itc(); + + if (t1 - t0 < best_t1 - best_t0) + best_t0 = t0, best_t1 = t1, best_tm = tm; + } + + *rt = best_t1 - best_t0; + *master = best_tm - best_t0; + + /* average best_t0 and best_t1 without overflow: */ + tcenter = (best_t0/2 + best_t1/2); + if (best_t0 % 2 + best_t1 % 2 == 2) + ++tcenter; + return tcenter - best_tm; +} + +/* + * Synchronize ar.itc of the current (slave) CPU with the ar.itc of the MASTER CPU + * (normally the time-keeper CPU). We use a closed loop to eliminate the possibility of + * unaccounted-for errors (such as getting a machine check in the middle of a calibration + * step). The basic idea is for the slave to ask the master what itc value it has and to + * read its own itc before and after the master responds. Each iteration gives us three + * timestamps: + * + * slave master + * + * t0 ---\ + * ---\ + * ---> + * tm + * /--- + * /--- + * t1 <--- + * + * + * The goal is to adjust the slave's ar.itc such that tm falls exactly half-way between t0 + * and t1. If we achieve this, the clocks are synchronized provided the interconnect + * between the slave and the master is symmetric. Even if the interconnect were + * asymmetric, we would still know that the synchronization error is smaller than the + * roundtrip latency (t0 - t1). + * + * When the interconnect is quiet and symmetric, this lets us synchronize the itc to + * within one or two cycles. 
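+ * (Concretely: get_delta() above returns tcenter - tm, where
+ * tcenter = (t0 + t1)/2, and the NUM_ROUNDS loop in ia64_sync_itc() below
+ * applies the negated delta, plus a damped estimate of the adjustment
+ * latency, through ia64_set_itc() until the measured delta reaches zero.)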
However, we can only *guarantee* that the synchronization is + * accurate to within a round-trip time, which is typically in the range of several + * hundred cycles (e.g., ~500 cycles). In practice, this means that the ITCs are usually + * almost perfectly synchronized, but we shouldn't assume that the accuracy is much better + * than half a microsecond or so. + */ +void +ia64_sync_itc (unsigned int master) +{ + long i, delta, adj, adjust_latency = 0, done = 0; + unsigned long flags, rt, master_time_stamp, bound; +#if DEBUG_ITC_SYNC + struct { + long rt; /* roundtrip time */ + long master; /* master's timestamp */ + long diff; /* difference between midpoint and master's timestamp */ + long lat; /* estimate of itc adjustment latency */ + } t[NUM_ROUNDS]; +#endif + + /* + * Make sure local timer ticks are disabled while we sync. If + * they were enabled, we'd have to worry about nasty issues + * like setting the ITC ahead of (or a long time before) the + * next scheduled tick. + */ + BUG_ON((ia64_get_itv() & (1 << 16)) == 0); + + go[MASTER] = 1; + + if (smp_call_function_single(master, sync_master, NULL, 1, 0) < 0) { + printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master); + return; + } + + while (go[MASTER]); /* wait for master to be ready */ + + spin_lock_irqsave(&itc_sync_lock, flags); + { + for (i = 0; i < NUM_ROUNDS; ++i) { + delta = get_delta(&rt, &master_time_stamp); + if (delta == 0) { + done = 1; /* let's lock on to this... */ + bound = rt; + } + + if (!done) { + if (i > 0) { + adjust_latency += -delta; + adj = -delta + adjust_latency/4; + } else + adj = -delta; + + ia64_set_itc(ia64_get_itc() + adj); + } +#if DEBUG_ITC_SYNC + t[i].rt = rt; + t[i].master = master_time_stamp; + t[i].diff = delta; + t[i].lat = adjust_latency/4; +#endif + } + } + spin_unlock_irqrestore(&itc_sync_lock, flags); + +#if DEBUG_ITC_SYNC + for (i = 0; i < NUM_ROUNDS; ++i) + printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", + t[i].rt, t[i].master, t[i].diff, t[i].lat); +#endif + + printk(KERN_INFO "CPU %d: synchronized ITC with CPU %u (last diff %ld cycles, " + "maxerr %lu cycles)\n", smp_processor_id(), master, delta, rt); +} + +/* + * Ideally sets up per-cpu profiling hooks. Doesn't do much now... + */ +static inline void __devinit +smp_setup_percpu_timer (void) +{ +} + +static void __devinit +smp_callin (void) +{ + int cpuid, phys_id; + extern void ia64_init_itm(void); + +#ifdef CONFIG_PERFMON + extern void pfm_init_percpu(void); +#endif + + cpuid = smp_processor_id(); + phys_id = hard_smp_processor_id(); + + if (cpu_online(cpuid)) { + printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", + phys_id, cpuid); + BUG(); + } + + lock_ipi_calllock(); + cpu_set(cpuid, cpu_online_map); + unlock_ipi_calllock(); + + smp_setup_percpu_timer(); + + ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ + +#ifdef CONFIG_PERFMON + pfm_init_percpu(); +#endif + + local_irq_enable(); + + if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { + /* + * Synchronize the ITC with the BP. Need to do this after irqs are + * enabled because ia64_sync_itc() calls smp_call_function_single(), which + * calls spin_unlock_bh(), which calls local_bh_enable(), which bugs out if + * irqs are not enabled... + */ + Dprintk("Going to syncup ITC with BP.\n"); + ia64_sync_itc(0); + } + + /* + * Get our bogomips.
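+ * (calibrate_delay() spins to estimate loops_per_jiffy; the BogoMIPS
+ * figure printed by show_cpuinfo() is derived from it as lpj*HZ/500000,
+ * so it measures a delay loop, not real throughput.)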
+ */ + ia64_init_itm(); + calibrate_delay(); + local_cpu_data->loops_per_jiffy = loops_per_jiffy; + +#ifdef CONFIG_IA32_SUPPORT + ia32_gdt_init(); +#endif + + /* + * Allow the master to continue. + */ + cpu_set(cpuid, cpu_callin_map); + Dprintk("Stack on CPU %d at about %p\n",cpuid, &cpuid); +} + + +/* + * Activate a secondary processor. head.S calls this. + */ +int __devinit +start_secondary (void *unused) +{ + /* Early console may use I/O ports */ + ia64_set_kr(IA64_KR_IO_BASE, __pa(ia64_iobase)); + + Dprintk("start_secondary: starting CPU 0x%x\n", hard_smp_processor_id()); + efi_map_pal_code(); + cpu_init(); + smp_callin(); + + cpu_idle(); + return 0; +} + +struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +{ + return NULL; +} + +struct create_idle { + struct task_struct *idle; + struct completion done; + int cpu; +}; + +void +do_fork_idle(void *_c_idle) +{ + struct create_idle *c_idle = _c_idle; + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + +static int __devinit +do_boot_cpu (int sapicid, int cpu) +{ + int timeout; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER(c_idle.done), + }; + DECLARE_WORK(work, do_fork_idle, &c_idle); + /* + * We can't use kernel_thread since we must avoid rescheduling the child. + */ + if (!keventd_up() || current_is_keventd()) + work.func(work.data); + else { + schedule_work(&work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) + panic("failed fork for CPU %d", cpu); + task_for_booting_cpu = c_idle.idle; + + Dprintk("Sending wakeup vector %lu to AP 0x%x/0x%x.\n", ap_wakeup_vector, cpu, sapicid); + + platform_send_ipi(cpu, ap_wakeup_vector, IA64_IPI_DM_INT, 0); + + /* + * Wait 10s total for the AP to start + */ + Dprintk("Waiting on callin_map ..."); + for (timeout = 0; timeout < 100000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + Dprintk("\n"); + + if (!cpu_isset(cpu, cpu_callin_map)) { + printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid); + ia64_cpu_to_sapicid[cpu] = -1; + cpu_clear(cpu, cpu_online_map); /* was set in smp_callin() */ + return -EINVAL; + } + return 0; +} + +static int __init +decay (char *str) +{ + int ticks; + get_option (&str, &ticks); + return 1; +} + +__setup("decay=", decay); + +/* + * Initialize the logical CPU number to SAPICID mapping + */ +void __init +smp_build_cpu_map (void) +{ + int sapicid, cpu, i; + int boot_cpu_id = hard_smp_processor_id(); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + ia64_cpu_to_sapicid[cpu] = -1; +#ifdef CONFIG_HOTPLUG_CPU + cpu_set(cpu, cpu_possible_map); +#endif + } + + ia64_cpu_to_sapicid[0] = boot_cpu_id; + cpus_clear(cpu_present_map); + cpu_set(0, cpu_present_map); + cpu_set(0, cpu_possible_map); + for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { + sapicid = smp_boot_data.cpu_phys_id[i]; + if (sapicid == boot_cpu_id) + continue; + cpu_set(cpu, cpu_present_map); + cpu_set(cpu, cpu_possible_map); + ia64_cpu_to_sapicid[cpu] = sapicid; + cpu++; + } +} + +#ifdef CONFIG_NUMA + +/* on which node is each logical CPU (one cacheline even for 64 CPUs) */ +u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_to_node_map); +/* which logical CPUs are on which nodes */ +cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; + +/* + * Build cpu to node mapping and initialize the per node cpu masks.
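+ *
+ * For example (hypothetical ids): on a two-node box where node_cpuid[]
+ * assigns physical ids {0, 1} to nid 0 and {4, 5} to nid 1, and logical
+ * cpus 0-3 carry those physical ids in that order, this loop yields
+ * cpu_to_node_map[] == {0, 0, 1, 1} and sets bits 2 and 3 in
+ * node_to_cpu_mask[1].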
+ */ +void __init +build_cpu_to_node_map (void) +{ + int cpu, i, node; + + for(node=0; node<MAX_NUMNODES; node++) + cpus_clear(node_to_cpu_mask[node]); + for(cpu = 0; cpu < NR_CPUS; ++cpu) { + /* + * All Itanium NUMA platforms I know use ACPI, so maybe we + * can drop this ifdef completely. [EF] + */ +#ifdef CONFIG_ACPI_NUMA + node = -1; + for (i = 0; i < NR_CPUS; ++i) + if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { + node = node_cpuid[i].nid; + break; + } +#else +# error Fixme: Dunno how to build CPU-to-node map. +#endif + cpu_to_node_map[cpu] = (node >= 0) ? node : 0; + if (node >= 0) + cpu_set(cpu, node_to_cpu_mask[node]); + } +} + +#endif /* CONFIG_NUMA */ + +/* + * Cycle through the APs sending Wakeup IPIs to boot each. + */ +void __init +smp_prepare_cpus (unsigned int max_cpus) +{ + int boot_cpu_id = hard_smp_processor_id(); + + /* + * Initialize the per-CPU profiling counter/multiplier + */ + + smp_setup_percpu_timer(); + + /* + * We have the boot CPU online for sure. + */ + cpu_set(0, cpu_online_map); + cpu_set(0, cpu_callin_map); + + local_cpu_data->loops_per_jiffy = loops_per_jiffy; + ia64_cpu_to_sapicid[0] = boot_cpu_id; + + printk(KERN_INFO "Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id); + + current_thread_info()->cpu = 0; + + /* + * If SMP should be disabled, then really disable it! + */ + if (!max_cpus) { + printk(KERN_INFO "SMP mode deactivated.\n"); + cpus_clear(cpu_online_map); + cpus_clear(cpu_present_map); + cpus_clear(cpu_possible_map); + cpu_set(0, cpu_online_map); + cpu_set(0, cpu_present_map); + cpu_set(0, cpu_possible_map); + return; + } +} + +void __devinit smp_prepare_boot_cpu(void) +{ + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), cpu_callin_map); +} + +#ifdef CONFIG_HOTPLUG_CPU +extern void fixup_irqs(void); +/* must be called with cpucontrol mutex held */ +static int __devinit cpu_enable(unsigned int cpu) +{ + per_cpu(cpu_state,cpu) = CPU_UP_PREPARE; + wmb(); + + while (!cpu_online(cpu)) + cpu_relax(); + return 0; +} + +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * dont permit boot processor for now + */ + if (cpu == 0) + return -EBUSY; + + fixup_irqs(); + local_flush_tlb_all(); + printk ("Disabled cpu %u\n", smp_processor_id()); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + unsigned int i; + + for (i = 0; i < 100; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) + { + /* + * TBD: Enable this when physical removal + * or when we put the processor is put in + * SAL_BOOT_RENDEZ mode + * cpu_clear(cpu, cpu_callin_map); + */ + return; + } + msleep(100); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} +#else /* !CONFIG_HOTPLUG_CPU */ +static int __devinit cpu_enable(unsigned int cpu) +{ + return 0; +} + +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void +smp_cpus_done (unsigned int dummy) +{ + int cpu; + unsigned long bogosum = 0; + + /* + * Allow the user to impress friends. 
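+	 *
+	 * A worked example of the arithmetic below (hypothetical numbers):
+	 * with HZ=1024 and two CPUs at loops_per_jiffy=997376 each,
+	 * bogosum=1994752, so the whole part prints as
+	 * 1994752/(500000/1024) = 1994752/488 = 4087 and the fraction as
+	 * (1994752/(5000/1024)) % 100 = (1994752/4) % 100 = 88, i.e.
+	 * "4087.88 BogoMIPS".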
+ */
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		if (cpu_online(cpu))
+			bogosum += cpu_data(cpu)->loops_per_jiffy;
+
+	printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+	       (int)num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
+}
+
+int __devinit
+__cpu_up (unsigned int cpu)
+{
+	int ret;
+	int sapicid;
+
+	sapicid = ia64_cpu_to_sapicid[cpu];
+	if (sapicid == -1)
+		return -EINVAL;
+
+	/*
+	 * Already booted; just enable it and get out of the idle loop.
+	 */
+	if (cpu_isset(cpu, cpu_callin_map))
+	{
+		cpu_enable(cpu);
+		local_irq_enable();
+		while (!cpu_isset(cpu, cpu_online_map))
+			mb();
+		return 0;
+	}
+	/* Processor goes to start_secondary(), sets online flag */
+	ret = do_boot_cpu(sapicid, cpu);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+/*
+ * Assume that CPUs have been discovered by some platform-dependent interface.  For
+ * SoftSDV/Lion, that would be ACPI.
+ *
+ * Setup of the IPI irq handler is done in irq.c:init_IRQ_SMP().
+ */
+void __init
+init_smp_config(void)
+{
+	struct fptr {
+		unsigned long fp;
+		unsigned long gp;
+	} *ap_startup;
+	long sal_ret;
+
+	/* Tell SAL where to drop the APs.  */
+	ap_startup = (struct fptr *) start_ap;
+	sal_ret = ia64_sal_set_vectors(SAL_VECTOR_OS_BOOT_RENDEZ,
+				       ia64_tpa(ap_startup->fp), ia64_tpa(ap_startup->gp), 0, 0, 0, 0);
+	if (sal_ret < 0)
+		printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
+		       ia64_sal_strerror(sal_ret));
+}
+
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
new file mode 100644
index 000000000000..3ac216e1c8bb
--- /dev/null
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -0,0 +1,298 @@
+/*
+ * This file contains various system calls that have different calling
+ * conventions on different platforms.
+ *
+ * Copyright (C) 1999-2000, 2002-2003, 2005 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/shm.h>
+#include <linux/file.h>		/* doh, must come after sched.h... */
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/syscalls.h>
+#include <linux/highuid.h>
+#include <linux/hugetlb.h>
+
+#include <asm/shmparam.h>
+#include <asm/uaccess.h>
+
+unsigned long
+arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len,
+			unsigned long pgoff, unsigned long flags)
+{
+	long map_shared = (flags & MAP_SHARED);
+	unsigned long start_addr, align_mask = PAGE_SIZE - 1;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	if (len > RGN_MAP_LIMIT)
+		return -ENOMEM;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (REGION_NUMBER(addr) == REGION_HPAGE)
+		addr = 0;
+#endif
+	if (!addr)
+		addr = mm->free_area_cache;
+
+	if (map_shared && (TASK_SIZE > 0xfffffffful))
+		/*
+		 * For 64-bit tasks, align shared segments to 1MB to avoid potential
+		 * performance penalty due to virtual aliasing (see ASDM).  For 32-bit
+		 * tasks, we prefer to avoid exhausting the address space too quickly by
+		 * limiting alignment to a single page.
+		 */
+		align_mask = SHMLBA - 1;
+
+  full_search:
+	start_addr = addr = (addr + align_mask) & ~align_mask;
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr || RGN_MAP_LIMIT - len < REGION_OFFSET(addr)) {
+			if (start_addr != TASK_UNMAPPED_BASE) {
+				/* Start a new search --- just in case we missed some holes.
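+				 * The first pass began at mm->free_area_cache, so
+				 * any free range below that point was never looked
+				 * at; one retry from TASK_UNMAPPED_BASE covers those
+				 * holes before we give up with -ENOMEM.
+				 *
+				 * Alignment reminder (illustrative numbers): with
+				 * align_mask = SHMLBA - 1 (1MB - 1, say), an addr of
+				 * 0x2000123 rounds up via (addr + align_mask) &
+				 * ~align_mask to 0x2100000.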
*/ + addr = TASK_UNMAPPED_BASE; + goto full_search; + } + return -ENOMEM; + } + if (!vma || addr + len <= vma->vm_start) { + /* Remember the address where we stopped this search: */ + mm->free_area_cache = addr + len; + return addr; + } + addr = (vma->vm_end + align_mask) & ~align_mask; + } +} + +asmlinkage long +ia64_getpriority (int which, int who) +{ + long prio; + + prio = sys_getpriority(which, who); + if (prio >= 0) { + force_successful_syscall_return(); + prio = 20 - prio; + } + return prio; +} + +/* XXX obsolete, but leave it here until the old libc is gone... */ +asmlinkage unsigned long +sys_getpagesize (void) +{ + return PAGE_SIZE; +} + +asmlinkage unsigned long +ia64_shmat (int shmid, void __user *shmaddr, int shmflg) +{ + unsigned long raddr; + int retval; + + retval = do_shmat(shmid, shmaddr, shmflg, &raddr); + if (retval < 0) + return retval; + + force_successful_syscall_return(); + return raddr; +} + +asmlinkage unsigned long +ia64_brk (unsigned long brk) +{ + unsigned long rlim, retval, newbrk, oldbrk; + struct mm_struct *mm = current->mm; + + /* + * Most of this replicates the code in sys_brk() except for an additional safety + * check and the clearing of r8. However, we can't call sys_brk() because we need + * to acquire the mmap_sem before we can do the test... + */ + down_write(&mm->mmap_sem); + + if (brk < mm->end_code) + goto out; + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against unimplemented/unmapped addresses: */ + if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT) + goto out; + + /* Check against rlimit.. */ + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) + goto out; + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Ok, looks good - let it rip. */ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; +set_brk: + mm->brk = brk; +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + force_successful_syscall_return(); + return retval; +} + +/* + * On IA-64, we return the two file descriptors in ret0 and ret1 (r8 + * and r9) as this is faster than doing a copy_to_user(). + */ +asmlinkage long +sys_pipe (void) +{ + struct pt_regs *regs = ia64_task_regs(current); + int fd[2]; + int retval; + + retval = do_pipe(fd); + if (retval) + goto out; + retval = fd[0]; + regs->r9 = fd[1]; + out: + return retval; +} + +static inline unsigned long +do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) +{ + unsigned long roff; + struct file *file = NULL; + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + return -EBADF; + + if (!file->f_op || !file->f_op->mmap) { + addr = -ENODEV; + goto out; + } + } + + /* + * A zero mmap always succeeds in Linux, independent of whether or not the + * remaining arguments are valid. + */ + if (len == 0) + goto out; + + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); + if (!len || len > TASK_SIZE) { + addr = -EINVAL; + goto out; + } + + /* + * Don't permit mappings into unmapped space, the virtual page table of a region, + * or across a region boundary. Note: RGN_MAP_LIMIT is equal to 2^n-PAGE_SIZE + * (for some integer n <= 61) and len > 0. 
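+	 *
+	 * Both comparisons below are written to be overflow-safe: len has
+	 * already been bounded by TASK_SIZE, and the first clause rejects
+	 * len > RGN_MAP_LIMIT, so RGN_MAP_LIMIT - len cannot wrap.  With
+	 * illustrative numbers, if RGN_MAP_LIMIT = 2^37 - PAGE_SIZE and
+	 * roff = 2^37 - 2*PAGE_SIZE, any len > PAGE_SIZE is rejected, since
+	 * the mapping would otherwise run past the end of the region.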
+ */ + roff = REGION_OFFSET(addr); + if ((len > RGN_MAP_LIMIT) || (roff > (RGN_MAP_LIMIT - len))) { + addr = -EINVAL; + goto out; + } + + down_write(¤t->mm->mmap_sem); + addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + +out: if (file) + fput(file); + return addr; +} + +/* + * mmap2() is like mmap() except that the offset is expressed in units + * of PAGE_SIZE (instead of bytes). This allows to mmap2() (pieces + * of) files that are larger than the address space of the CPU. + */ +asmlinkage unsigned long +sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff) +{ + addr = do_mmap2(addr, len, prot, flags, fd, pgoff); + if (!IS_ERR((void *) addr)) + force_successful_syscall_return(); + return addr; +} + +asmlinkage unsigned long +sys_mmap (unsigned long addr, unsigned long len, int prot, int flags, int fd, long off) +{ + if (offset_in_page(off) != 0) + return -EINVAL; + + addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + if (!IS_ERR((void *) addr)) + force_successful_syscall_return(); + return addr; +} + +asmlinkage unsigned long +ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, + unsigned long new_addr) +{ + extern unsigned long do_mremap (unsigned long addr, + unsigned long old_len, + unsigned long new_len, + unsigned long flags, + unsigned long new_addr); + + down_write(¤t->mm->mmap_sem); + { + addr = do_mremap(addr, old_len, new_len, flags, new_addr); + } + up_write(¤t->mm->mmap_sem); + + if (IS_ERR((void *) addr)) + return addr; + + force_successful_syscall_return(); + return addr; +} + +#ifndef CONFIG_PCI + +asmlinkage long +sys_pciconfig_read (unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, + void *buf) +{ + return -ENOSYS; +} + +asmlinkage long +sys_pciconfig_write (unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, + void *buf) +{ + return -ENOSYS; +} + +#endif /* CONFIG_PCI */ diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c new file mode 100644 index 000000000000..8b8a5a45b621 --- /dev/null +++ b/arch/ia64/kernel/time.c @@ -0,0 +1,255 @@ +/* + * linux/arch/ia64/kernel/time.c + * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger <davidm@hpl.hp.com> + * Copyright (C) 1999 Don Dugger <don.dugger@intel.com> + * Copyright (C) 1999-2000 VA Linux Systems + * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com> + */ +#include <linux/config.h> + +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/time.h> +#include <linux/interrupt.h> +#include <linux/efi.h> +#include <linux/profile.h> +#include <linux/timex.h> + +#include <asm/machvec.h> +#include <asm/delay.h> +#include <asm/hw_irq.h> +#include <asm/ptrace.h> +#include <asm/sal.h> +#include <asm/sections.h> +#include <asm/system.h> + +extern unsigned long wall_jiffies; + +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +#define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ + +#ifdef CONFIG_IA64_DEBUG_IRQ + +unsigned long last_cli_ip; +EXPORT_SYMBOL(last_cli_ip); + +#endif + +static struct time_interpolator itc_interpolator = { + .shift = 16, + .mask = 0xffffffffffffffffLL, + .source = TIME_SOURCE_CPU +}; + +static irqreturn_t +timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) +{ + 
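+	/*
+	 * In outline: account one tick per elapsed itm_delta since the last
+	 * interrupt (the time-keeper CPU additionally runs do_timer() under
+	 * xtime_lock), then reprogram the ITM far enough past the current
+	 * ITC that the next interrupt cannot be lost.
+	 */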
unsigned long new_itm; + + if (unlikely(cpu_is_offline(smp_processor_id()))) { + return IRQ_HANDLED; + } + + platform_timer_interrupt(irq, dev_id, regs); + + new_itm = local_cpu_data->itm_next; + + if (!time_after(ia64_get_itc(), new_itm)) + printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", + ia64_get_itc(), new_itm); + + profile_tick(CPU_PROFILING, regs); + + while (1) { + update_process_times(user_mode(regs)); + + new_itm += local_cpu_data->itm_delta; + + if (smp_processor_id() == TIME_KEEPER_ID) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(regs); + local_cpu_data->itm_next = new_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_next = new_itm; + + if (time_after(new_itm, ia64_get_itc())) + break; + } + + do { + /* + * If we're too close to the next clock tick for + * comfort, we increase the safety margin by + * intentionally dropping the next tick(s). We do NOT + * update itm.next because that would force us to call + * do_timer() which in turn would let our clock run + * too fast (with the potentially devastating effect + * of losing monotony of time). + */ + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) + new_itm += local_cpu_data->itm_delta; + ia64_set_itm(new_itm); + /* double check, in case we got hit by a (slow) PMI: */ + } while (time_after_eq(ia64_get_itc(), new_itm)); + return IRQ_HANDLED; +} + +/* + * Encapsulate access to the itm structure for SMP. + */ +void +ia64_cpu_local_tick (void) +{ + int cpu = smp_processor_id(); + unsigned long shift = 0, delta; + + /* arrange for the cycle counter to generate a timer interrupt: */ + ia64_set_itv(IA64_TIMER_VECTOR); + + delta = local_cpu_data->itm_delta; + /* + * Stagger the timer tick for each CPU so they don't occur all at (almost) the + * same time: + */ + if (cpu) { + unsigned long hi = 1UL << ia64_fls(cpu); + shift = (2*(cpu - hi) + 1) * delta/hi/2; + } + local_cpu_data->itm_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_next); +} + +static int nojitter; + +static int __init nojitter_setup(char *str) +{ + nojitter = 1; + printk("Jitter checking for ITC timers disabled\n"); + return 1; +} + +__setup("nojitter", nojitter_setup); + + +void __devinit +ia64_init_itm (void) +{ + unsigned long platform_base_freq, itc_freq; + struct pal_freq_ratio itc_ratio, proc_ratio; + long status, platform_base_drift, itc_drift; + + /* + * According to SAL v2.6, we need to use a SAL call to determine the platform base + * frequency and then a PAL call to determine the frequency ratio between the ITC + * and the base frequency. 
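+	 *
+	 * Worked example (illustrative values, not from any real platform):
+	 * a 200MHz platform base with an ITC ratio of 15/2 gives
+	 * itc_freq = 200000000*15/2 = 1.5GHz, and with HZ=1024 the per-tick
+	 * period computed below is
+	 *
+	 *	itm_delta = (1500000000 + 1024/2)/1024 = 1464844 cycles.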
+ */ + status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM, + &platform_base_freq, &platform_base_drift); + if (status != 0) { + printk(KERN_ERR "SAL_FREQ_BASE_PLATFORM failed: %s\n", ia64_sal_strerror(status)); + } else { + status = ia64_pal_freq_ratios(&proc_ratio, NULL, &itc_ratio); + if (status != 0) + printk(KERN_ERR "PAL_FREQ_RATIOS failed with status=%ld\n", status); + } + if (status != 0) { + /* invent "random" values */ + printk(KERN_ERR + "SAL/PAL failed to obtain frequency info---inventing reasonable values\n"); + platform_base_freq = 100000000; + platform_base_drift = -1; /* no drift info */ + itc_ratio.num = 3; + itc_ratio.den = 1; + } + if (platform_base_freq < 40000000) { + printk(KERN_ERR "Platform base frequency %lu bogus---resetting to 75MHz!\n", + platform_base_freq); + platform_base_freq = 75000000; + platform_base_drift = -1; + } + if (!proc_ratio.den) + proc_ratio.den = 1; /* avoid division by zero */ + if (!itc_ratio.den) + itc_ratio.den = 1; /* avoid division by zero */ + + itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; + + local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%lu/%lu, " + "ITC freq=%lu.%03luMHz", smp_processor_id(), + platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, + itc_ratio.num, itc_ratio.den, itc_freq / 1000000, (itc_freq / 1000) % 1000); + + if (platform_base_drift != -1) { + itc_drift = platform_base_drift*itc_ratio.num/itc_ratio.den; + printk("+/-%ldppm\n", itc_drift); + } else { + itc_drift = -1; + printk("\n"); + } + + local_cpu_data->proc_freq = (platform_base_freq*proc_ratio.num)/proc_ratio.den; + local_cpu_data->itc_freq = itc_freq; + local_cpu_data->cyc_per_usec = (itc_freq + USEC_PER_SEC/2) / USEC_PER_SEC; + local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT) + + itc_freq/2)/itc_freq; + + if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { + itc_interpolator.frequency = local_cpu_data->itc_freq; + itc_interpolator.drift = itc_drift; +#ifdef CONFIG_SMP + /* On IA64 in an SMP configuration ITCs are never accurately synchronized. + * Jitter compensation requires a cmpxchg which may limit + * the scalability of the syscalls for retrieving time. + * The ITC synchronization is usually successful to within a few + * ITC ticks but this is not a sure thing. If you need to improve + * timer performance in SMP situations then boot the kernel with the + * "nojitter" option. However, doing so may result in time fluctuating (maybe + * even going backward) if the ITC offsets between the individual CPUs + * are too large. + */ + if (!nojitter) itc_interpolator.jitter = 1; +#endif + register_time_interpolator(&itc_interpolator); + } + + /* Setup the CPU local timer tick */ + ia64_cpu_local_tick(); +} + +static struct irqaction timer_irqaction = { + .handler = timer_interrupt, + .flags = SA_INTERRUPT, + .name = "timer" +}; + +void __init +time_init (void) +{ + register_percpu_irq(IA64_TIMER_VECTOR, &timer_irqaction); + efi_gettimeofday(&xtime); + ia64_init_itm(); + + /* + * Initialize wall_to_monotonic such that adding it to xtime will yield zero, the + * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). 
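+	 *
+	 * For example (hypothetical boot time): if EFI reports
+	 * xtime = { 1000, 300000000 }, set_normalized_timespec() turns
+	 * { -1000, -300000000 } into { -1001, 700000000 }; adding that
+	 * back to xtime indeed yields zero.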
+ */ + set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); +} diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c new file mode 100644 index 000000000000..f1aafd4c05f9 --- /dev/null +++ b/arch/ia64/kernel/topology.c @@ -0,0 +1,92 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * This file contains NUMA specific variables and functions which can + * be split away from DISCONTIGMEM and are used on NUMA machines with + * contiguous memory. + * 2002/08/07 Erich Focht <efocht@ess.nec.de> + * Populate cpu entries in sysfs for non-numa systems as well + * Intel Corporation - Ashok Raj + */ + +#include <linux/config.h> +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/node.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/nodemask.h> +#include <asm/mmzone.h> +#include <asm/numa.h> +#include <asm/cpu.h> + +#ifdef CONFIG_NUMA +static struct node *sysfs_nodes; +#endif +static struct ia64_cpu *sysfs_cpus; + +int arch_register_cpu(int num) +{ + struct node *parent = NULL; + +#ifdef CONFIG_NUMA + parent = &sysfs_nodes[cpu_to_node(num)]; +#endif /* CONFIG_NUMA */ + + return register_cpu(&sysfs_cpus[num].cpu, num, parent); +} + +#ifdef CONFIG_HOTPLUG_CPU + +void arch_unregister_cpu(int num) +{ + struct node *parent = NULL; + +#ifdef CONFIG_NUMA + int node = cpu_to_node(num); + parent = &sysfs_nodes[node]; +#endif /* CONFIG_NUMA */ + + return unregister_cpu(&sysfs_cpus[num].cpu, parent); +} +EXPORT_SYMBOL(arch_register_cpu); +EXPORT_SYMBOL(arch_unregister_cpu); +#endif /*CONFIG_HOTPLUG_CPU*/ + + +static int __init topology_init(void) +{ + int i, err = 0; + +#ifdef CONFIG_NUMA + sysfs_nodes = kmalloc(sizeof(struct node) * MAX_NUMNODES, GFP_KERNEL); + if (!sysfs_nodes) { + err = -ENOMEM; + goto out; + } + memset(sysfs_nodes, 0, sizeof(struct node) * MAX_NUMNODES); + + /* MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? */ + for_each_online_node(i) + if ((err = register_node(&sysfs_nodes[i], i, 0))) + goto out; +#endif + + sysfs_cpus = kmalloc(sizeof(struct ia64_cpu) * NR_CPUS, GFP_KERNEL); + if (!sysfs_cpus) { + err = -ENOMEM; + goto out; + } + memset(sysfs_cpus, 0, sizeof(struct ia64_cpu) * NR_CPUS); + + for_each_present_cpu(i) + if((err = arch_register_cpu(i))) + goto out; +out: + return err; +} + +__initcall(topology_init); diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c new file mode 100644 index 000000000000..e82ad78081b3 --- /dev/null +++ b/arch/ia64/kernel/traps.c @@ -0,0 +1,609 @@ +/* + * Architecture-specific trap handling. 
+ * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * 05/12/00 grao <goutham.rao@intel.com> : added isr in siginfo for SIGFPE + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/module.h> /* for EXPORT_SYMBOL */ +#include <linux/hardirq.h> + +#include <asm/fpswa.h> +#include <asm/ia32.h> +#include <asm/intrinsics.h> +#include <asm/processor.h> +#include <asm/uaccess.h> + +extern spinlock_t timerlist_lock; + +fpswa_interface_t *fpswa_interface; +EXPORT_SYMBOL(fpswa_interface); + +void __init +trap_init (void) +{ + if (ia64_boot_param->fpswa) + /* FPSWA fixup: make the interface pointer a kernel virtual address: */ + fpswa_interface = __va(ia64_boot_param->fpswa); +} + +/* + * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock + * is acquired through the console unblank code) + */ +void +bust_spinlocks (int yes) +{ + int loglevel_save = console_loglevel; + + if (yes) { + oops_in_progress = 1; + return; + } + +#ifdef CONFIG_VT + unblank_screen(); +#endif + oops_in_progress = 0; + /* + * OK, the message is on the console. Now we call printk() without + * oops_in_progress set so that printk will give klogd a poke. Hold onto + * your hats... + */ + console_loglevel = 15; /* NMI oopser may have shut the console up */ + printk(" "); + console_loglevel = loglevel_save; +} + +void +die (const char *str, struct pt_regs *regs, long err) +{ + static struct { + spinlock_t lock; + u32 lock_owner; + int lock_owner_depth; + } die = { + .lock = SPIN_LOCK_UNLOCKED, + .lock_owner = -1, + .lock_owner_depth = 0 + }; + static int die_counter; + + if (die.lock_owner != smp_processor_id()) { + console_verbose(); + spin_lock_irq(&die.lock); + die.lock_owner = smp_processor_id(); + die.lock_owner_depth = 0; + bust_spinlocks(1); + } + + if (++die.lock_owner_depth < 3) { + printk("%s[%d]: %s %ld [%d]\n", + current->comm, current->pid, str, err, ++die_counter); + show_regs(regs); + } else + printk(KERN_ERR "Recursive die() failure, output suppressed\n"); + + bust_spinlocks(0); + die.lock_owner = -1; + spin_unlock_irq(&die.lock); + do_exit(SIGSEGV); +} + +void +die_if_kernel (char *str, struct pt_regs *regs, long err) +{ + if (!user_mode(regs)) + die(str, regs, err); +} + +void +ia64_bad_break (unsigned long break_num, struct pt_regs *regs) +{ + siginfo_t siginfo; + int sig, code; + + /* SIGILL, SIGFPE, SIGSEGV, and SIGBUS want these field initialized: */ + siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); + siginfo.si_imm = break_num; + siginfo.si_flags = 0; /* clear __ISR_VALID */ + siginfo.si_isr = 0; + + switch (break_num) { + case 0: /* unknown error (used by GCC for __builtin_abort()) */ + die_if_kernel("bugcheck!", regs, break_num); + sig = SIGILL; code = ILL_ILLOPC; + break; + + case 1: /* integer divide by zero */ + sig = SIGFPE; code = FPE_INTDIV; + break; + + case 2: /* integer overflow */ + sig = SIGFPE; code = FPE_INTOVF; + break; + + case 3: /* range check/bounds check */ + sig = SIGFPE; code = FPE_FLTSUB; + break; + + case 4: /* null pointer dereference */ + sig = SIGSEGV; code = SEGV_MAPERR; + break; + + case 5: /* misaligned data */ + sig = SIGSEGV; code = BUS_ADRALN; + break; + + case 6: /* decimal overflow */ + sig = SIGFPE; code = __FPE_DECOVF; + break; + + case 7: /* decimal divide by zero */ + sig = SIGFPE; code = __FPE_DECDIV; + break; + 
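+	/*
+	 * Illustrative user-level trigger for the cases in this switch
+	 * (assuming the break immediate reaches us unmodified):
+	 *
+	 *	asm volatile ("break 1");	-- lands here, raises SIGFPE/FPE_INTDIV
+	 */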
+ case 8: /* packed decimal error */ + sig = SIGFPE; code = __FPE_DECERR; + break; + + case 9: /* invalid ASCII digit */ + sig = SIGFPE; code = __FPE_INVASC; + break; + + case 10: /* invalid decimal digit */ + sig = SIGFPE; code = __FPE_INVDEC; + break; + + case 11: /* paragraph stack overflow */ + sig = SIGSEGV; code = __SEGV_PSTKOVF; + break; + + case 0x3f000 ... 0x3ffff: /* bundle-update in progress */ + sig = SIGILL; code = __ILL_BNDMOD; + break; + + default: + if (break_num < 0x40000 || break_num > 0x100000) + die_if_kernel("Bad break", regs, break_num); + + if (break_num < 0x80000) { + sig = SIGILL; code = __ILL_BREAK; + } else { + sig = SIGTRAP; code = TRAP_BRKPT; + } + } + siginfo.si_signo = sig; + siginfo.si_errno = 0; + siginfo.si_code = code; + force_sig_info(sig, &siginfo, current); +} + +/* + * disabled_fph_fault() is called when a user-level process attempts to access f32..f127 + * and it doesn't own the fp-high register partition. When this happens, we save the + * current fph partition in the task_struct of the fpu-owner (if necessary) and then load + * the fp-high partition of the current task (if necessary). Note that the kernel has + * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes + * care of clearing psr.dfh. + */ +static inline void +disabled_fph_fault (struct pt_regs *regs) +{ + struct ia64_psr *psr = ia64_psr(regs); + + /* first, grant user-level access to fph partition: */ + psr->dfh = 0; +#ifndef CONFIG_SMP + { + struct task_struct *fpu_owner + = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); + + if (ia64_is_local_fpu_owner(current)) + return; + + if (fpu_owner) + ia64_flush_fph(fpu_owner); + } +#endif /* !CONFIG_SMP */ + ia64_set_local_fpu_owner(current); + if ((current->thread.flags & IA64_THREAD_FPH_VALID) != 0) { + __ia64_load_fpu(current->thread.fph); + psr->mfh = 0; + } else { + __ia64_init_fpu(); + /* + * Set mfh because the state in thread.fph does not match the state in + * the fph partition. + */ + psr->mfh = 1; + } +} + +static inline int +fp_emulate (int fp_fault, void *bundle, long *ipsr, long *fpsr, long *isr, long *pr, long *ifs, + struct pt_regs *regs) +{ + fp_state_t fp_state; + fpswa_ret_t ret; + + if (!fpswa_interface) + return -1; + + memset(&fp_state, 0, sizeof(fp_state_t)); + + /* + * compute fp_state. only FP registers f6 - f11 are used by the + * kernel, so set those bits in the mask and set the low volatile + * pointer to point to these registers. + */ + fp_state.bitmask_low64 = 0xfc0; /* bit6..bit11 */ + + fp_state.fp_state_low_volatile = (fp_state_low_volatile_t *) ®s->f6; + /* + * unsigned long (*EFI_FPSWA) ( + * unsigned long trap_type, + * void *Bundle, + * unsigned long *pipsr, + * unsigned long *pfsr, + * unsigned long *pisr, + * unsigned long *ppreds, + * unsigned long *pifs, + * void *fp_state); + */ + ret = (*fpswa_interface->fpswa)((unsigned long) fp_fault, bundle, + (unsigned long *) ipsr, (unsigned long *) fpsr, + (unsigned long *) isr, (unsigned long *) pr, + (unsigned long *) ifs, &fp_state); + + return ret.status; +} + +/* + * Handle floating-point assist faults and traps. 
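+ *
+ * A note on the fault_ip adjustment below: for an FP *fault*, cr.iip still
+ * points at the faulting bundle, while for an FP *trap* it has already
+ * advanced.  ri == 0 on a trap therefore means the excepting instruction
+ * was in the previous bundle, so we back up one 16-byte bundle before
+ * fetching the instruction to hand to fp_emulate().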
+ */ +static int +handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) +{ + long exception, bundle[2]; + unsigned long fault_ip; + struct siginfo siginfo; + static int fpu_swa_count = 0; + static unsigned long last_time; + + fault_ip = regs->cr_iip; + if (!fp_fault && (ia64_psr(regs)->ri == 0)) + fault_ip -= 16; + if (copy_from_user(bundle, (void __user *) fault_ip, sizeof(bundle))) + return -1; + + if (jiffies - last_time > 5*HZ) + fpu_swa_count = 0; + if ((fpu_swa_count < 4) && !(current->thread.flags & IA64_THREAD_FPEMU_NOPRINT)) { + last_time = jiffies; + ++fpu_swa_count; + printk(KERN_WARNING + "%s(%d): floating-point assist fault at ip %016lx, isr %016lx\n", + current->comm, current->pid, regs->cr_iip + ia64_psr(regs)->ri, isr); + } + + exception = fp_emulate(fp_fault, bundle, ®s->cr_ipsr, ®s->ar_fpsr, &isr, ®s->pr, + ®s->cr_ifs, regs); + if (fp_fault) { + if (exception == 0) { + /* emulation was successful */ + ia64_increment_ip(regs); + } else if (exception == -1) { + printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n"); + return -1; + } else { + /* is next instruction a trap? */ + if (exception & 2) { + ia64_increment_ip(regs); + } + siginfo.si_signo = SIGFPE; + siginfo.si_errno = 0; + siginfo.si_code = __SI_FAULT; /* default code */ + siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); + if (isr & 0x11) { + siginfo.si_code = FPE_FLTINV; + } else if (isr & 0x22) { + /* denormal operand gets the same si_code as underflow + * see arch/i386/kernel/traps.c:math_error() */ + siginfo.si_code = FPE_FLTUND; + } else if (isr & 0x44) { + siginfo.si_code = FPE_FLTDIV; + } + siginfo.si_isr = isr; + siginfo.si_flags = __ISR_VALID; + siginfo.si_imm = 0; + force_sig_info(SIGFPE, &siginfo, current); + } + } else { + if (exception == -1) { + printk(KERN_ERR "handle_fpu_swa: fp_emulate() returned -1\n"); + return -1; + } else if (exception != 0) { + /* raise exception */ + siginfo.si_signo = SIGFPE; + siginfo.si_errno = 0; + siginfo.si_code = __SI_FAULT; /* default code */ + siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); + if (isr & 0x880) { + siginfo.si_code = FPE_FLTOVF; + } else if (isr & 0x1100) { + siginfo.si_code = FPE_FLTUND; + } else if (isr & 0x2200) { + siginfo.si_code = FPE_FLTRES; + } + siginfo.si_isr = isr; + siginfo.si_flags = __ISR_VALID; + siginfo.si_imm = 0; + force_sig_info(SIGFPE, &siginfo, current); + } + } + return 0; +} + +struct illegal_op_return { + unsigned long fkt, arg1, arg2, arg3; +}; + +struct illegal_op_return +ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3, + long arg4, long arg5, long arg6, long arg7, + struct pt_regs regs) +{ + struct illegal_op_return rv; + struct siginfo si; + char buf[128]; + +#ifdef CONFIG_IA64_BRL_EMU + { + extern struct illegal_op_return ia64_emulate_brl (struct pt_regs *, unsigned long); + + rv = ia64_emulate_brl(®s, ec); + if (rv.fkt != (unsigned long) -1) + return rv; + } +#endif + + sprintf(buf, "IA-64 Illegal operation fault"); + die_if_kernel(buf, ®s, 0); + + memset(&si, 0, sizeof(si)); + si.si_signo = SIGILL; + si.si_code = ILL_ILLOPC; + si.si_addr = (void __user *) (regs.cr_iip + ia64_psr(®s)->ri); + force_sig_info(SIGILL, &si, current); + rv.fkt = 0; + return rv; +} + +void +ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, + unsigned long iim, unsigned long itir, long arg5, long arg6, + long arg7, struct pt_regs regs) +{ + unsigned long code, error = isr, iip; + struct siginfo siginfo; + char buf[128]; + int result, 
sig; + static const char *reason[] = { + "IA-64 Illegal Operation fault", + "IA-64 Privileged Operation fault", + "IA-64 Privileged Register fault", + "IA-64 Reserved Register/Field fault", + "Disabled Instruction Set Transition fault", + "Unknown fault 5", "Unknown fault 6", "Unknown fault 7", "Illegal Hazard fault", + "Unknown fault 9", "Unknown fault 10", "Unknown fault 11", "Unknown fault 12", + "Unknown fault 13", "Unknown fault 14", "Unknown fault 15" + }; + + if ((isr & IA64_ISR_NA) && ((isr & IA64_ISR_CODE_MASK) == IA64_ISR_CODE_LFETCH)) { + /* + * This fault was due to lfetch.fault, set "ed" bit in the psr to cancel + * the lfetch. + */ + ia64_psr(®s)->ed = 1; + return; + } + + iip = regs.cr_iip + ia64_psr(®s)->ri; + + switch (vector) { + case 24: /* General Exception */ + code = (isr >> 4) & 0xf; + sprintf(buf, "General Exception: %s%s", reason[code], + (code == 3) ? ((isr & (1UL << 37)) + ? " (RSE access)" : " (data access)") : ""); + if (code == 8) { +# ifdef CONFIG_IA64_PRINT_HAZARDS + printk("%s[%d]: possible hazard @ ip=%016lx (pr = %016lx)\n", + current->comm, current->pid, + regs.cr_iip + ia64_psr(®s)->ri, regs.pr); +# endif + return; + } + break; + + case 25: /* Disabled FP-Register */ + if (isr & 2) { + disabled_fph_fault(®s); + return; + } + sprintf(buf, "Disabled FPL fault---not supposed to happen!"); + break; + + case 26: /* NaT Consumption */ + if (user_mode(®s)) { + void __user *addr; + + if (((isr >> 4) & 0xf) == 2) { + /* NaT page consumption */ + sig = SIGSEGV; + code = SEGV_ACCERR; + addr = (void __user *) ifa; + } else { + /* register NaT consumption */ + sig = SIGILL; + code = ILL_ILLOPN; + addr = (void __user *) (regs.cr_iip + + ia64_psr(®s)->ri); + } + siginfo.si_signo = sig; + siginfo.si_code = code; + siginfo.si_errno = 0; + siginfo.si_addr = addr; + siginfo.si_imm = vector; + siginfo.si_flags = __ISR_VALID; + siginfo.si_isr = isr; + force_sig_info(sig, &siginfo, current); + return; + } else if (ia64_done_with_exception(®s)) + return; + sprintf(buf, "NaT consumption"); + break; + + case 31: /* Unsupported Data Reference */ + if (user_mode(®s)) { + siginfo.si_signo = SIGILL; + siginfo.si_code = ILL_ILLOPN; + siginfo.si_errno = 0; + siginfo.si_addr = (void __user *) iip; + siginfo.si_imm = vector; + siginfo.si_flags = __ISR_VALID; + siginfo.si_isr = isr; + force_sig_info(SIGILL, &siginfo, current); + return; + } + sprintf(buf, "Unsupported data reference"); + break; + + case 29: /* Debug */ + case 35: /* Taken Branch Trap */ + case 36: /* Single Step Trap */ + if (fsys_mode(current, ®s)) { + extern char __kernel_syscall_via_break[]; + /* + * Got a trap in fsys-mode: Taken Branch Trap and Single Step trap + * need special handling; Debug trap is not supposed to happen. + */ + if (unlikely(vector == 29)) { + die("Got debug trap in fsys-mode---not supposed to happen!", + ®s, 0); + return; + } + /* re-do the system call via break 0x100000: */ + regs.cr_iip = (unsigned long) __kernel_syscall_via_break; + ia64_psr(®s)->ri = 0; + ia64_psr(®s)->cpl = 3; + return; + } + switch (vector) { + case 29: + siginfo.si_code = TRAP_HWBKPT; +#ifdef CONFIG_ITANIUM + /* + * Erratum 10 (IFA may contain incorrect address) now has + * "NoFix" status. There are no plans for fixing this. 
+ */ + if (ia64_psr(®s)->is == 0) + ifa = regs.cr_iip; +#endif + break; + case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break; + case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break; + } + siginfo.si_signo = SIGTRAP; + siginfo.si_errno = 0; + siginfo.si_addr = (void __user *) ifa; + siginfo.si_imm = 0; + siginfo.si_flags = __ISR_VALID; + siginfo.si_isr = isr; + force_sig_info(SIGTRAP, &siginfo, current); + return; + + case 32: /* fp fault */ + case 33: /* fp trap */ + result = handle_fpu_swa((vector == 32) ? 1 : 0, ®s, isr); + if ((result < 0) || (current->thread.flags & IA64_THREAD_FPEMU_SIGFPE)) { + siginfo.si_signo = SIGFPE; + siginfo.si_errno = 0; + siginfo.si_code = FPE_FLTINV; + siginfo.si_addr = (void __user *) iip; + siginfo.si_flags = __ISR_VALID; + siginfo.si_isr = isr; + siginfo.si_imm = 0; + force_sig_info(SIGFPE, &siginfo, current); + } + return; + + case 34: + if (isr & 0x2) { + /* Lower-Privilege Transfer Trap */ + /* + * Just clear PSR.lp and then return immediately: all the + * interesting work (e.g., signal delivery is done in the kernel + * exit path). + */ + ia64_psr(®s)->lp = 0; + return; + } else { + /* Unimplemented Instr. Address Trap */ + if (user_mode(®s)) { + siginfo.si_signo = SIGILL; + siginfo.si_code = ILL_BADIADDR; + siginfo.si_errno = 0; + siginfo.si_flags = 0; + siginfo.si_isr = 0; + siginfo.si_imm = 0; + siginfo.si_addr = (void __user *) iip; + force_sig_info(SIGILL, &siginfo, current); + return; + } + sprintf(buf, "Unimplemented Instruction Address fault"); + } + break; + + case 45: +#ifdef CONFIG_IA32_SUPPORT + if (ia32_exception(®s, isr) == 0) + return; +#endif + printk(KERN_ERR "Unexpected IA-32 exception (Trap 45)\n"); + printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx\n", + iip, ifa, isr); + force_sig(SIGSEGV, current); + break; + + case 46: +#ifdef CONFIG_IA32_SUPPORT + if (ia32_intercept(®s, isr) == 0) + return; +#endif + printk(KERN_ERR "Unexpected IA-32 intercept trap (Trap 46)\n"); + printk(KERN_ERR " iip - 0x%lx, ifa - 0x%lx, isr - 0x%lx, iim - 0x%lx\n", + iip, ifa, isr, iim); + force_sig(SIGSEGV, current); + return; + + case 47: + sprintf(buf, "IA-32 Interruption Fault (int 0x%lx)", isr >> 16); + break; + + default: + sprintf(buf, "Fault %lu", vector); + break; + } + die_if_kernel(buf, ®s, error); + force_sig(SIGILL, current); +} diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c new file mode 100644 index 000000000000..43b45b65ee5a --- /dev/null +++ b/arch/ia64/kernel/unaligned.c @@ -0,0 +1,1521 @@ +/* + * Architecture-specific unaligned trap handling. + * + * Copyright (C) 1999-2002, 2004 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * 2002/12/09 Fix rotating register handling (off-by-1 error, missing fr-rotation). Fix + * get_rse_reg() to not leak kernel bits to user-level (reading an out-of-frame + * stacked register returns an undefined value; it does NOT trigger a + * "rsvd register fault"). + * 2001/10/11 Fix unaligned access to rotating registers in s/w pipelined loops. + * 2001/08/13 Correct size of extended floats (float_fsz) from 16 to 10 bytes. + * 2001/01/17 Add support emulation of unaligned kernel accesses. 
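+ *
+ * For context, a minimal user-level sequence that ends up here
+ * (illustrative only; any misaligned access will do):
+ *
+ *	struct rec { char pad; unsigned long v; } __attribute__((packed));
+ *	unsigned long x = p->v;		-- 8-byte load at a misaligned address
+ *
+ * Unless the compiler splits the access itself, the CPU raises an
+ * unaligned reference fault and the handlers below emulate the access.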
+ */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/tty.h> + +#include <asm/intrinsics.h> +#include <asm/processor.h> +#include <asm/rse.h> +#include <asm/uaccess.h> +#include <asm/unaligned.h> + +extern void die_if_kernel(char *str, struct pt_regs *regs, long err) __attribute__ ((noreturn)); + +#undef DEBUG_UNALIGNED_TRAP + +#ifdef DEBUG_UNALIGNED_TRAP +# define DPRINT(a...) do { printk("%s %u: ", __FUNCTION__, __LINE__); printk (a); } while (0) +# define DDUMP(str,vp,len) dump(str, vp, len) + +static void +dump (const char *str, void *vp, size_t len) +{ + unsigned char *cp = vp; + int i; + + printk("%s", str); + for (i = 0; i < len; ++i) + printk (" %02x", *cp++); + printk("\n"); +} +#else +# define DPRINT(a...) +# define DDUMP(str,vp,len) +#endif + +#define IA64_FIRST_STACKED_GR 32 +#define IA64_FIRST_ROTATING_FR 32 +#define SIGN_EXT9 0xffffffffffffff00ul + +/* + * For M-unit: + * + * opcode | m | x6 | + * --------|------|---------| + * [40-37] | [36] | [35:30] | + * --------|------|---------| + * 4 | 1 | 6 | = 11 bits + * -------------------------- + * However bits [31:30] are not directly useful to distinguish between + * load/store so we can use [35:32] instead, which gives the following + * mask ([40:32]) using 9 bits. The 'e' comes from the fact that we defer + * checking the m-bit until later in the load/store emulation. + */ +#define IA64_OPCODE_MASK 0x1ef +#define IA64_OPCODE_SHIFT 32 + +/* + * Table C-28 Integer Load/Store + * + * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF + * + * ld8.fill, st8.fill MUST be aligned because the RNATs are based on + * the address (bits [8:3]), so we must failed. + */ +#define LD_OP 0x080 +#define LDS_OP 0x081 +#define LDA_OP 0x082 +#define LDSA_OP 0x083 +#define LDBIAS_OP 0x084 +#define LDACQ_OP 0x085 +/* 0x086, 0x087 are not relevant */ +#define LDCCLR_OP 0x088 +#define LDCNC_OP 0x089 +#define LDCCLRACQ_OP 0x08a +#define ST_OP 0x08c +#define STREL_OP 0x08d +/* 0x08e,0x8f are not relevant */ + +/* + * Table C-29 Integer Load +Reg + * + * we use the ld->m (bit [36:36]) field to determine whether or not we have + * a load/store of this form. + */ + +/* + * Table C-30 Integer Load/Store +Imm + * + * We ignore [35:32]= 0x6, 0x7, 0xE, 0xF + * + * ld8.fill, st8.fill must be aligned because the Nat register are based on + * the address, so we must fail and the program must be fixed. + */ +#define LD_IMM_OP 0x0a0 +#define LDS_IMM_OP 0x0a1 +#define LDA_IMM_OP 0x0a2 +#define LDSA_IMM_OP 0x0a3 +#define LDBIAS_IMM_OP 0x0a4 +#define LDACQ_IMM_OP 0x0a5 +/* 0x0a6, 0xa7 are not relevant */ +#define LDCCLR_IMM_OP 0x0a8 +#define LDCNC_IMM_OP 0x0a9 +#define LDCCLRACQ_IMM_OP 0x0aa +#define ST_IMM_OP 0x0ac +#define STREL_IMM_OP 0x0ad +/* 0x0ae,0xaf are not relevant */ + +/* + * Table C-32 Floating-point Load/Store + */ +#define LDF_OP 0x0c0 +#define LDFS_OP 0x0c1 +#define LDFA_OP 0x0c2 +#define LDFSA_OP 0x0c3 +/* 0x0c6 is irrelevant */ +#define LDFCCLR_OP 0x0c8 +#define LDFCNC_OP 0x0c9 +/* 0x0cb is irrelevant */ +#define STF_OP 0x0cc + +/* + * Table C-33 Floating-point Load +Reg + * + * we use the ld->m (bit [36:36]) field to determine whether or not we have + * a load/store of this form. 
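+ *
+ * Tying the tables together (sketch, matching the macros above): the 9
+ * opcode bits [40:32] of the instruction are isolated as
+ *
+ *	opcode = (insn >> IA64_OPCODE_SHIFT) & IA64_OPCODE_MASK;
+ *
+ * and compared against LD_OP, ST_OP, LDF_OP, etc., with the m-bit left
+ * out of the mask (the 'e' nibble) and checked separately.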
+ */ + +/* + * Table C-34 Floating-point Load/Store +Imm + */ +#define LDF_IMM_OP 0x0e0 +#define LDFS_IMM_OP 0x0e1 +#define LDFA_IMM_OP 0x0e2 +#define LDFSA_IMM_OP 0x0e3 +/* 0x0e6 is irrelevant */ +#define LDFCCLR_IMM_OP 0x0e8 +#define LDFCNC_IMM_OP 0x0e9 +#define STF_IMM_OP 0x0ec + +typedef struct { + unsigned long qp:6; /* [0:5] */ + unsigned long r1:7; /* [6:12] */ + unsigned long imm:7; /* [13:19] */ + unsigned long r3:7; /* [20:26] */ + unsigned long x:1; /* [27:27] */ + unsigned long hint:2; /* [28:29] */ + unsigned long x6_sz:2; /* [30:31] */ + unsigned long x6_op:4; /* [32:35], x6 = x6_sz|x6_op */ + unsigned long m:1; /* [36:36] */ + unsigned long op:4; /* [37:40] */ + unsigned long pad:23; /* [41:63] */ +} load_store_t; + + +typedef enum { + UPD_IMMEDIATE, /* ldXZ r1=[r3],imm(9) */ + UPD_REG /* ldXZ r1=[r3],r2 */ +} update_t; + +/* + * We use tables to keep track of the offsets of registers in the saved state. + * This way we save having big switch/case statements. + * + * We use bit 0 to indicate switch_stack or pt_regs. + * The offset is simply shifted by 1 bit. + * A 2-byte value should be enough to hold any kind of offset + * + * In case the calling convention changes (and thus pt_regs/switch_stack) + * simply use RSW instead of RPT or vice-versa. + */ + +#define RPO(x) ((size_t) &((struct pt_regs *)0)->x) +#define RSO(x) ((size_t) &((struct switch_stack *)0)->x) + +#define RPT(x) (RPO(x) << 1) +#define RSW(x) (1| RSO(x)<<1) + +#define GR_OFFS(x) (gr_info[x]>>1) +#define GR_IN_SW(x) (gr_info[x] & 0x1) + +#define FR_OFFS(x) (fr_info[x]>>1) +#define FR_IN_SW(x) (fr_info[x] & 0x1) + +static u16 gr_info[32]={ + 0, /* r0 is read-only : WE SHOULD NEVER GET THIS */ + + RPT(r1), RPT(r2), RPT(r3), + + RSW(r4), RSW(r5), RSW(r6), RSW(r7), + + RPT(r8), RPT(r9), RPT(r10), RPT(r11), + RPT(r12), RPT(r13), RPT(r14), RPT(r15), + + RPT(r16), RPT(r17), RPT(r18), RPT(r19), + RPT(r20), RPT(r21), RPT(r22), RPT(r23), + RPT(r24), RPT(r25), RPT(r26), RPT(r27), + RPT(r28), RPT(r29), RPT(r30), RPT(r31) +}; + +static u16 fr_info[32]={ + 0, /* constant : WE SHOULD NEVER GET THIS */ + 0, /* constant : WE SHOULD NEVER GET THIS */ + + RSW(f2), RSW(f3), RSW(f4), RSW(f5), + + RPT(f6), RPT(f7), RPT(f8), RPT(f9), + RPT(f10), RPT(f11), + + RSW(f12), RSW(f13), RSW(f14), + RSW(f15), RSW(f16), RSW(f17), RSW(f18), RSW(f19), + RSW(f20), RSW(f21), RSW(f22), RSW(f23), RSW(f24), + RSW(f25), RSW(f26), RSW(f27), RSW(f28), RSW(f29), + RSW(f30), RSW(f31) +}; + +/* Invalidate ALAT entry for integer register REGNO. 
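+ * (The 128-way switch below exists because the register number must be a
+ * compile-time constant -- assuming ia64_invala_gr() expands to inline asm
+ * that encodes the register in the invala.e instruction -- so each
+ * register gets its own case via the F() macro.)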
*/ +static void +invala_gr (int regno) +{ +# define F(reg) case reg: ia64_invala_gr(reg); break + + switch (regno) { + F( 0); F( 1); F( 2); F( 3); F( 4); F( 5); F( 6); F( 7); + F( 8); F( 9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15); + F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23); + F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31); + F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39); + F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47); + F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55); + F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63); + F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71); + F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79); + F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87); + F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95); + F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103); + F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111); + F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119); + F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127); + } +# undef F +} + +/* Invalidate ALAT entry for floating-point register REGNO. */ +static void +invala_fr (int regno) +{ +# define F(reg) case reg: ia64_invala_fr(reg); break + + switch (regno) { + F( 0); F( 1); F( 2); F( 3); F( 4); F( 5); F( 6); F( 7); + F( 8); F( 9); F( 10); F( 11); F( 12); F( 13); F( 14); F( 15); + F( 16); F( 17); F( 18); F( 19); F( 20); F( 21); F( 22); F( 23); + F( 24); F( 25); F( 26); F( 27); F( 28); F( 29); F( 30); F( 31); + F( 32); F( 33); F( 34); F( 35); F( 36); F( 37); F( 38); F( 39); + F( 40); F( 41); F( 42); F( 43); F( 44); F( 45); F( 46); F( 47); + F( 48); F( 49); F( 50); F( 51); F( 52); F( 53); F( 54); F( 55); + F( 56); F( 57); F( 58); F( 59); F( 60); F( 61); F( 62); F( 63); + F( 64); F( 65); F( 66); F( 67); F( 68); F( 69); F( 70); F( 71); + F( 72); F( 73); F( 74); F( 75); F( 76); F( 77); F( 78); F( 79); + F( 80); F( 81); F( 82); F( 83); F( 84); F( 85); F( 86); F( 87); + F( 88); F( 89); F( 90); F( 91); F( 92); F( 93); F( 94); F( 95); + F( 96); F( 97); F( 98); F( 99); F(100); F(101); F(102); F(103); + F(104); F(105); F(106); F(107); F(108); F(109); F(110); F(111); + F(112); F(113); F(114); F(115); F(116); F(117); F(118); F(119); + F(120); F(121); F(122); F(123); F(124); F(125); F(126); F(127); + } +# undef F +} + +static inline unsigned long +rotate_reg (unsigned long sor, unsigned long rrb, unsigned long reg) +{ + reg += rrb; + if (reg >= sor) + reg -= sor; + return reg; +} + +static void +set_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long val, int nat) +{ + struct switch_stack *sw = (struct switch_stack *) regs - 1; + unsigned long *bsp, *bspstore, *addr, *rnat_addr, *ubs_end; + unsigned long *kbs = (void *) current + IA64_RBS_OFFSET; + unsigned long rnats, nat_mask; + unsigned long on_kbs; + long sof = (regs->cr_ifs) & 0x7f; + long sor = 8 * ((regs->cr_ifs >> 14) & 0xf); + long rrb_gr = (regs->cr_ifs >> 18) & 0x7f; + long ridx = r1 - 32; + + if (ridx >= sof) { + /* this should never happen, as the "rsvd register fault" has higher priority */ + DPRINT("ignoring write to r%lu; only %lu registers are allocated!\n", r1, sof); + return; + } + + if (ridx < sor) + ridx = rotate_reg(sor, rrb_gr, ridx); + + DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n", + r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx); + + on_kbs = ia64_rse_num_regs(kbs, (unsigned 
long *) sw->ar_bspstore); + addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx); + if (addr >= kbs) { + /* the register is on the kernel backing store: easy... */ + rnat_addr = ia64_rse_rnat_addr(addr); + if ((unsigned long) rnat_addr >= sw->ar_bspstore) + rnat_addr = &sw->ar_rnat; + nat_mask = 1UL << ia64_rse_slot_num(addr); + + *addr = val; + if (nat) + *rnat_addr |= nat_mask; + else + *rnat_addr &= ~nat_mask; + return; + } + + if (!user_stack(current, regs)) { + DPRINT("ignoring kernel write to r%lu; register isn't on the kernel RBS!", r1); + return; + } + + bspstore = (unsigned long *)regs->ar_bspstore; + ubs_end = ia64_rse_skip_regs(bspstore, on_kbs); + bsp = ia64_rse_skip_regs(ubs_end, -sof); + addr = ia64_rse_skip_regs(bsp, ridx); + + DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr); + + ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val); + + rnat_addr = ia64_rse_rnat_addr(addr); + + ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats); + DPRINT("rnat @%p = 0x%lx nat=%d old nat=%ld\n", + (void *) rnat_addr, rnats, nat, (rnats >> ia64_rse_slot_num(addr)) & 1); + + nat_mask = 1UL << ia64_rse_slot_num(addr); + if (nat) + rnats |= nat_mask; + else + rnats &= ~nat_mask; + ia64_poke(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, rnats); + + DPRINT("rnat changed to @%p = 0x%lx\n", (void *) rnat_addr, rnats); +} + + +static void +get_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long *val, int *nat) +{ + struct switch_stack *sw = (struct switch_stack *) regs - 1; + unsigned long *bsp, *addr, *rnat_addr, *ubs_end, *bspstore; + unsigned long *kbs = (void *) current + IA64_RBS_OFFSET; + unsigned long rnats, nat_mask; + unsigned long on_kbs; + long sof = (regs->cr_ifs) & 0x7f; + long sor = 8 * ((regs->cr_ifs >> 14) & 0xf); + long rrb_gr = (regs->cr_ifs >> 18) & 0x7f; + long ridx = r1 - 32; + + if (ridx >= sof) { + /* read of out-of-frame register returns an undefined value; 0 in our case. */ + DPRINT("ignoring read from r%lu; only %lu registers are allocated!\n", r1, sof); + goto fail; + } + + if (ridx < sor) + ridx = rotate_reg(sor, rrb_gr, ridx); + + DPRINT("r%lu, sw.bspstore=%lx pt.bspstore=%lx sof=%ld sol=%ld ridx=%ld\n", + r1, sw->ar_bspstore, regs->ar_bspstore, sof, (regs->cr_ifs >> 7) & 0x7f, ridx); + + on_kbs = ia64_rse_num_regs(kbs, (unsigned long *) sw->ar_bspstore); + addr = ia64_rse_skip_regs((unsigned long *) sw->ar_bspstore, -sof + ridx); + if (addr >= kbs) { + /* the register is on the kernel backing store: easy... 
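+		 * As in set_rse_reg() above: assuming the usual RSE layout,
+		 * every 64th slot of the backing store (addr bits 8:3 all
+		 * ones) holds an RNAT collection word, so the register's NaT
+		 * bit lives at slot number
+		 *
+		 *	ia64_rse_slot_num(addr) == (addr >> 3) & 0x3f
+		 *
+		 * of the word returned by ia64_rse_rnat_addr(addr).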
*/ + *val = *addr; + if (nat) { + rnat_addr = ia64_rse_rnat_addr(addr); + if ((unsigned long) rnat_addr >= sw->ar_bspstore) + rnat_addr = &sw->ar_rnat; + nat_mask = 1UL << ia64_rse_slot_num(addr); + *nat = (*rnat_addr & nat_mask) != 0; + } + return; + } + + if (!user_stack(current, regs)) { + DPRINT("ignoring kernel read of r%lu; register isn't on the RBS!", r1); + goto fail; + } + + bspstore = (unsigned long *)regs->ar_bspstore; + ubs_end = ia64_rse_skip_regs(bspstore, on_kbs); + bsp = ia64_rse_skip_regs(ubs_end, -sof); + addr = ia64_rse_skip_regs(bsp, ridx); + + DPRINT("ubs_end=%p bsp=%p addr=%p\n", (void *) ubs_end, (void *) bsp, (void *) addr); + + ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) addr, val); + + if (nat) { + rnat_addr = ia64_rse_rnat_addr(addr); + nat_mask = 1UL << ia64_rse_slot_num(addr); + + DPRINT("rnat @%p = 0x%lx\n", (void *) rnat_addr, rnats); + + ia64_peek(current, sw, (unsigned long) ubs_end, (unsigned long) rnat_addr, &rnats); + *nat = (rnats & nat_mask) != 0; + } + return; + + fail: + *val = 0; + if (nat) + *nat = 0; + return; +} + + +static void +setreg (unsigned long regnum, unsigned long val, int nat, struct pt_regs *regs) +{ + struct switch_stack *sw = (struct switch_stack *) regs - 1; + unsigned long addr; + unsigned long bitmask; + unsigned long *unat; + + /* + * First takes care of stacked registers + */ + if (regnum >= IA64_FIRST_STACKED_GR) { + set_rse_reg(regs, regnum, val, nat); + return; + } + + /* + * Using r0 as a target raises a General Exception fault which has higher priority + * than the Unaligned Reference fault. + */ + + /* + * Now look at registers in [0-31] range and init correct UNAT + */ + if (GR_IN_SW(regnum)) { + addr = (unsigned long)sw; + unat = &sw->ar_unat; + } else { + addr = (unsigned long)regs; + unat = &sw->caller_unat; + } + DPRINT("tmp_base=%lx switch_stack=%s offset=%d\n", + addr, unat==&sw->ar_unat ? "yes":"no", GR_OFFS(regnum)); + /* + * add offset from base of struct + * and do it ! + */ + addr += GR_OFFS(regnum); + + *(unsigned long *)addr = val; + + /* + * We need to clear the corresponding UNAT bit to fully emulate the load + * UNAT bit_pos = GR[r3]{8:3} form EAS-2.4 + */ + bitmask = 1UL << (addr >> 3 & 0x3f); + DPRINT("*0x%lx=0x%lx NaT=%d prev_unat @%p=%lx\n", addr, val, nat, (void *) unat, *unat); + if (nat) { + *unat |= bitmask; + } else { + *unat &= ~bitmask; + } + DPRINT("*0x%lx=0x%lx NaT=%d new unat: %p=%lx\n", addr, val, nat, (void *) unat,*unat); +} + +/* + * Return the (rotated) index for floating point register REGNUM (REGNUM must be in the + * range from 32-127, result is in the range from 0-95. + */ +static inline unsigned long +fph_index (struct pt_regs *regs, long regnum) +{ + unsigned long rrb_fr = (regs->cr_ifs >> 25) & 0x7f; + return rotate_reg(96, rrb_fr, (regnum - IA64_FIRST_ROTATING_FR)); +} + +static void +setfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs) +{ + struct switch_stack *sw = (struct switch_stack *)regs - 1; + unsigned long addr; + + /* + * From EAS-2.5: FPDisableFault has higher priority than Unaligned + * Fault. Thus, when we get here, we know the partition is enabled. + * To update f32-f127, there are three choices: + * + * (1) save f32-f127 to thread.fph and update the values there + * (2) use a gigantic switch statement to directly access the registers + * (3) generate code on the fly to update the desired register + * + * For now, we are using approach (1). 
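+	 *
+	 * Rotation example: with CFM.rrb.fr = 5, a store to f34 lands in
+	 * fph_index() = rotate_reg(96, 5, 34 - 32) = 7, i.e. in
+	 * current->thread.fph[7] rather than fph[2].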
+ */ + if (regnum >= IA64_FIRST_ROTATING_FR) { + ia64_sync_fph(current); + current->thread.fph[fph_index(regs, regnum)] = *fpval; + } else { + /* + * pt_regs or switch_stack ? + */ + if (FR_IN_SW(regnum)) { + addr = (unsigned long)sw; + } else { + addr = (unsigned long)regs; + } + + DPRINT("tmp_base=%lx offset=%d\n", addr, FR_OFFS(regnum)); + + addr += FR_OFFS(regnum); + *(struct ia64_fpreg *)addr = *fpval; + + /* + * mark the low partition as being used now + * + * It is highly unlikely that this bit is not already set, but + * let's do it for safety. + */ + regs->cr_ipsr |= IA64_PSR_MFL; + } +} + +/* + * Those 2 inline functions generate the spilled versions of the constant floating point + * registers which can be used with stfX + */ +static inline void +float_spill_f0 (struct ia64_fpreg *final) +{ + ia64_stf_spill(final, 0); +} + +static inline void +float_spill_f1 (struct ia64_fpreg *final) +{ + ia64_stf_spill(final, 1); +} + +static void +getfpreg (unsigned long regnum, struct ia64_fpreg *fpval, struct pt_regs *regs) +{ + struct switch_stack *sw = (struct switch_stack *) regs - 1; + unsigned long addr; + + /* + * From EAS-2.5: FPDisableFault has higher priority than + * Unaligned Fault. Thus, when we get here, we know the partition is + * enabled. + * + * When regnum > 31, the register is still live and we need to force a save + * to current->thread.fph to get access to it. See discussion in setfpreg() + * for reasons and other ways of doing this. + */ + if (regnum >= IA64_FIRST_ROTATING_FR) { + ia64_flush_fph(current); + *fpval = current->thread.fph[fph_index(regs, regnum)]; + } else { + /* + * f0 = 0.0, f1= 1.0. Those registers are constant and are thus + * not saved, we must generate their spilled form on the fly + */ + switch(regnum) { + case 0: + float_spill_f0(fpval); + break; + case 1: + float_spill_f1(fpval); + break; + default: + /* + * pt_regs or switch_stack ? + */ + addr = FR_IN_SW(regnum) ? (unsigned long)sw + : (unsigned long)regs; + + DPRINT("is_sw=%d tmp_base=%lx offset=0x%x\n", + FR_IN_SW(regnum), addr, FR_OFFS(regnum)); + + addr += FR_OFFS(regnum); + *fpval = *(struct ia64_fpreg *)addr; + } + } +} + + +static void +getreg (unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs) +{ + struct switch_stack *sw = (struct switch_stack *) regs - 1; + unsigned long addr, *unat; + + if (regnum >= IA64_FIRST_STACKED_GR) { + get_rse_reg(regs, regnum, val, nat); + return; + } + + /* + * take care of r0 (read-only always evaluate to 0) + */ + if (regnum == 0) { + *val = 0; + if (nat) + *nat = 0; + return; + } + + /* + * Now look at registers in [0-31] range and init correct UNAT + */ + if (GR_IN_SW(regnum)) { + addr = (unsigned long)sw; + unat = &sw->ar_unat; + } else { + addr = (unsigned long)regs; + unat = &sw->caller_unat; + } + + DPRINT("addr_base=%lx offset=0x%x\n", addr, GR_OFFS(regnum)); + + addr += GR_OFFS(regnum); + + *val = *(unsigned long *)addr; + + /* + * do it only when requested + */ + if (nat) + *nat = (*unat >> (addr >> 3 & 0x3f)) & 0x1UL; +} + +static void +emulate_load_updates (update_t type, load_store_t ld, struct pt_regs *regs, unsigned long ifa) +{ + /* + * IMPORTANT: + * Given the way we handle unaligned speculative loads, we should + * not get to this point in the code but we keep this sanity check, + * just in case. 
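+	 *
+	 * For reference, the imm9 reconstruction done below combines ld.m
+	 * (sign), ld.x and ld.imm: e.g. ld.m=1, ld.x=1, ld.imm=0x7f gives
+	 * imm = (1 << 7) | 0x7f = 0xff, and the sign extension
+	 * (imm |= SIGN_EXT9) turns that into -1, i.e. a one-byte
+	 * post-decrement of r3.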
+ */
+	if (ld.x6_op == 1 || ld.x6_op == 3) {
+		printk(KERN_ERR "%s: register update on speculative load, error\n", __FUNCTION__);
+		die_if_kernel("unaligned reference on speculative load with register update\n",
+			      regs, 30);
+	}
+
+
+	/*
+	 * at this point, we know that the base register to update is valid i.e.,
+	 * it's not r0
+	 */
+	if (type == UPD_IMMEDIATE) {
+		unsigned long imm;
+
+		/*
+		 * Load +Imm: ldXZ r1=[r3],imm(9)
+		 *
+		 *
+		 * form imm9: [13:19] contain the first 7 bits
+		 */
+		imm = ld.x << 7 | ld.imm;
+
+		/*
+		 * sign extend (1+8bits) if m set
+		 */
+		if (ld.m) imm |= SIGN_EXT9;
+
+		/*
+		 * ifa == r3 and we know that the NaT bit on r3 was clear so
+		 * we can directly use ifa.
+		 */
+		ifa += imm;
+
+		setreg(ld.r3, ifa, 0, regs);
+
+		DPRINT("ld.x=%d ld.m=%d imm=%ld r3=0x%lx\n", ld.x, ld.m, imm, ifa);
+
+	} else if (ld.m) {
+		unsigned long r2;
+		int nat_r2;
+
+		/*
+		 * Load +Reg Opcode: ldXZ r1=[r3],r2
+		 *
+		 * Note that we update r3 even in the case of ldfX.a
+		 * (where the load does not happen)
+		 *
+		 * The way the load algorithm works, we know that r3 does not
+		 * have its NaT bit set (would have gotten NaT consumption
+		 * before getting the unaligned fault). So we can use ifa
+		 * which equals r3 at this point.
+		 *
+		 * IMPORTANT:
+		 * The above statement holds ONLY because we know that we
+		 * never reach this code when trying to do a ldX.s.
+		 * If we ever make it to here on an ldfX.s then the
+		 * assumption that r3's NaT bit is clear would no longer hold.
+		 */
+		getreg(ld.imm, &r2, &nat_r2, regs);
+
+		ifa += r2;
+
+		/*
+		 * propagate Nat r2 -> r3
+		 */
+		setreg(ld.r3, ifa, nat_r2, regs);
+
+		DPRINT("imm=%d r2=%ld r3=0x%lx nat_r2=%d\n", ld.imm, r2, ifa, nat_r2);
+	}
+}
+
+
+static int
+emulate_load_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
+{
+	unsigned int len = 1 << ld.x6_sz;
+	unsigned long val = 0;
+
+	/*
+	 * r0, as target, doesn't need to be checked because Illegal Instruction
+	 * faults have higher priority than unaligned faults.
+	 *
+	 * r0 cannot be found as the base as it would never generate an
+	 * unaligned reference.
+	 */
+
+	/*
+	 * ldX.a we will emulate load and also invalidate the ALAT entry.
+	 * See comment below for explanation on how we handle ldX.a
+	 */
+
+	if (len != 2 && len != 4 && len != 8) {
+		DPRINT("unknown size: x6=%d\n", ld.x6_sz);
+		return -1;
+	}
+	/* this assumes little-endian byte-order: */
+	if (copy_from_user(&val, (void __user *) ifa, len))
+		return -1;
+	setreg(ld.r1, val, 0, regs);
+
+	/*
+	 * check for updates on any kind of loads
+	 */
+	if (ld.op == 0x5 || ld.m)
+		emulate_load_updates(ld.op == 0x5 ? UPD_IMMEDIATE : UPD_REG, ld, regs, ifa);
+
+	/*
+	 * handling of various loads (based on EAS2.4):
+	 *
+	 * ldX.acq (ordered load):
+	 *	- acquire semantics would have been used, so force fence instead.
+	 *
+	 * ldX.c.clr (check load and clear):
+	 *	- if we get to this handler, it's because the entry was not in the ALAT.
+	 *	  Therefore the operation reverts to a normal load
+	 *
+	 * ldX.c.nc (check load no clear):
+	 *	- same as previous one
+	 *
+	 * ldX.c.clr.acq (ordered check load and clear):
+	 *	- same as above for c.clr part. The load needs to have acquire semantics. So
+	 *	  we use the fence semantics which is stronger and thus ensures correctness.
+	 *
+	 * ldX.a (advanced load):
+	 *	- suppose ldX.a r1=[r3]. If we get to the unaligned trap it's because the
+	 *	  address doesn't match requested size alignment. This means that we would
+	 *	  possibly need more than one load to get the result.
+	 *
+	 *	  The load part can be handled just like a normal load, however the difficult
+	 *	  part is to get the right thing into the ALAT. The critical piece of information
+	 *	  is the base address of the load & its size. To do that, a ld.a must be executed;
+	 *	  clearly any address can be pushed into the table by using ld1.a r1=[r3]. Now
+	 *	  if we use the same target register, we will be okay for the check.a instruction.
+	 *	  If we look at the store, basically a stX [r3]=r1 checks the ALAT for any entry
+	 *	  which would overlap within [r3,r3+X] (the size of the load was stored in the
+	 *	  ALAT). If such an entry is found the entry is invalidated. But this is not good
+	 *	  enough, take the following example:
+	 *		r3=3
+	 *		ld4.a r1=[r3]
+	 *
+	 *	  Could be emulated by doing:
+	 *		ld1.a r1=[r3],1
+	 *		store to temporary;
+	 *		ld1.a r1=[r3],1
+	 *		store & shift to temporary;
+	 *		ld1.a r1=[r3],1
+	 *		store & shift to temporary;
+	 *		ld1.a r1=[r3]
+	 *		store & shift to temporary;
+	 *		r1=temporary
+	 *
+	 *	  So in this case, you would get the right value in r1 but the wrong info in
+	 *	  the ALAT. Notice that you could do it in reverse to finish with address 3
+	 *	  but you would still get the size wrong. To get the size right, one needs to
+	 *	  execute exactly the same kind of load. You could do it from an aligned
+	 *	  temporary location, but you would get the address wrong.
+	 *
+	 *	  So no matter what, it is not possible to emulate an advanced load
+	 *	  correctly. But is that really critical?
+	 *
+	 *	  We will always convert ld.a into a normal load with the ALAT invalidated. This
+	 *	  will enable the compiler to do optimizations where certain code paths after ld.a
+	 *	  are not required to have ld.c/chk.a, e.g., code paths with no intervening stores.
+	 *
+	 *	  If there is a store after the advanced load, one must either do a ld.c.* or
+	 *	  chk.a.* to reuse the value stored in the ALAT. Both can "fail" (meaning no
+	 *	  entry found in ALAT), and that's perfectly ok because:
+	 *
+	 *		- ld.c.*, if the entry is not present a normal load is executed
+	 *		- chk.a.*, if the entry is not present, execution jumps to recovery code
+	 *
+	 *	  In either case, the load can be potentially retried in another form.
+	 *
+	 *	  The ALAT must be invalidated for the register (so that chk.a or ld.c don't pick
+	 *	  up a stale entry later). The register base update MUST also be performed.
+	 */
+
+	/*
+	 * when the load has the .acq completer then
+	 * use ordering fence.
+	 */
+	if (ld.x6_op == 0x5 || ld.x6_op == 0xa)
+		mb();
+
+	/*
+	 * invalidate ALAT entry in case of advanced load
+	 */
+	if (ld.x6_op == 0x2)
+		invala_gr(ld.r1);
+
+	return 0;
+}
+
+static int
+emulate_store_int (unsigned long ifa, load_store_t ld, struct pt_regs *regs)
+{
+	unsigned long r2;
+	unsigned int len = 1 << ld.x6_sz;
+
+	/*
+	 * if we get to this handler, Nat bits on both r3 and r2 have already
+	 * been checked, so we don't need to do it again
+	 *
+	 * extract the value to be stored
+	 */
+	getreg(ld.imm, &r2, NULL, regs);
+
+	/*
+	 * we rely on the macros in unaligned.h for now i.e.,
+	 * we let the compiler figure out how to read memory gracefully.
+	 *
+	 * We need this switch/case because of the way the inline
+	 * functions work. The code is optimized by the compiler and looks like
+	 * a single switch/case.
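+	 *
+	 * Byte-wise the emulation is simple: on little-endian IA-64 (as
+	 * assumed below), a 4-byte store of r2 to an unaligned ifa places
+	 * r2{7:0} at ifa, r2{15:8} at ifa+1, and so on, which is exactly
+	 * what the copy_to_user() of the low len bytes of r2 achieves.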
+ */ + DPRINT("st%d [%lx]=%lx\n", len, ifa, r2); + + if (len != 2 && len != 4 && len != 8) { + DPRINT("unknown size: x6=%d\n", ld.x6_sz); + return -1; + } + + /* this assumes little-endian byte-order: */ + if (copy_to_user((void __user *) ifa, &r2, len)) + return -1; + + /* + * stX [r3]=r2,imm(9) + * + * NOTE: + * ld.r3 can never be r0, because r0 would not generate an + * unaligned access. + */ + if (ld.op == 0x5) { + unsigned long imm; + + /* + * form imm9: [12:6] contain first 7bits + */ + imm = ld.x << 7 | ld.r1; + /* + * sign extend (8bits) if m set + */ + if (ld.m) imm |= SIGN_EXT9; + /* + * ifa == r3 (NaT is necessarily cleared) + */ + ifa += imm; + + DPRINT("imm=%lx r3=%lx\n", imm, ifa); + + setreg(ld.r3, ifa, 0, regs); + } + /* + * we don't have alat_invalidate_multiple() so we need + * to do the complete flush :-<< + */ + ia64_invala(); + + /* + * stX.rel: use fence instead of release + */ + if (ld.x6_op == 0xd) + mb(); + + return 0; +} + +/* + * floating point operations sizes in bytes + */ +static const unsigned char float_fsz[4]={ + 10, /* extended precision (e) */ + 8, /* integer (8) */ + 4, /* single precision (s) */ + 8 /* double precision (d) */ +}; + +static inline void +mem2float_extended (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldfe(6, init); + ia64_stop(); + ia64_stf_spill(final, 6); +} + +static inline void +mem2float_integer (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldf8(6, init); + ia64_stop(); + ia64_stf_spill(final, 6); +} + +static inline void +mem2float_single (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldfs(6, init); + ia64_stop(); + ia64_stf_spill(final, 6); +} + +static inline void +mem2float_double (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldfd(6, init); + ia64_stop(); + ia64_stf_spill(final, 6); +} + +static inline void +float2mem_extended (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldf_fill(6, init); + ia64_stop(); + ia64_stfe(final, 6); +} + +static inline void +float2mem_integer (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldf_fill(6, init); + ia64_stop(); + ia64_stf8(final, 6); +} + +static inline void +float2mem_single (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldf_fill(6, init); + ia64_stop(); + ia64_stfs(final, 6); +} + +static inline void +float2mem_double (struct ia64_fpreg *init, struct ia64_fpreg *final) +{ + ia64_ldf_fill(6, init); + ia64_stop(); + ia64_stfd(final, 6); +} + +static int +emulate_load_floatpair (unsigned long ifa, load_store_t ld, struct pt_regs *regs) +{ + struct ia64_fpreg fpr_init[2]; + struct ia64_fpreg fpr_final[2]; + unsigned long len = float_fsz[ld.x6_sz]; + + /* + * fr0 & fr1 don't need to be checked because Illegal Instruction faults have + * higher priority than unaligned faults. + * + * r0 cannot be found as the base as it would never generate an unaligned + * reference. + */ + + /* + * make sure we get clean buffers + */ + memset(&fpr_init, 0, sizeof(fpr_init)); + memset(&fpr_final, 0, sizeof(fpr_final)); + + /* + * ldfpX.a: we don't try to emulate anything but we must + * invalidate the ALAT entry and execute updates, if any. + */ + if (ld.x6_op != 0x2) { + /* + * This assumes little-endian byte-order. 
Note that there is no "ldfpe" + * instruction: + */ + if (copy_from_user(&fpr_init[0], (void __user *) ifa, len) + || copy_from_user(&fpr_init[1], (void __user *) (ifa + len), len)) + return -1; + + DPRINT("ld.r1=%d ld.imm=%d x6_sz=%d\n", ld.r1, ld.imm, ld.x6_sz); + DDUMP("frp_init =", &fpr_init, 2*len); + /* + * XXX fixme + * Could optimize inlines by using ldfpX & 2 spills + */ + switch( ld.x6_sz ) { + case 0: + mem2float_extended(&fpr_init[0], &fpr_final[0]); + mem2float_extended(&fpr_init[1], &fpr_final[1]); + break; + case 1: + mem2float_integer(&fpr_init[0], &fpr_final[0]); + mem2float_integer(&fpr_init[1], &fpr_final[1]); + break; + case 2: + mem2float_single(&fpr_init[0], &fpr_final[0]); + mem2float_single(&fpr_init[1], &fpr_final[1]); + break; + case 3: + mem2float_double(&fpr_init[0], &fpr_final[0]); + mem2float_double(&fpr_init[1], &fpr_final[1]); + break; + } + DDUMP("fpr_final =", &fpr_final, 2*len); + /* + * XXX fixme + * + * A possible optimization would be to drop fpr_final and directly + * use the storage from the saved context i.e., the actual final + * destination (pt_regs, switch_stack or thread structure). + */ + setfpreg(ld.r1, &fpr_final[0], regs); + setfpreg(ld.imm, &fpr_final[1], regs); + } + + /* + * Check for updates: only immediate updates are available for this + * instruction. + */ + if (ld.m) { + /* + * the immediate is implicit given the ldsz of the operation: + * single: 8 (2x4) and for all others it's 16 (2x8) + */ + ifa += len<<1; + + /* + * IMPORTANT: + * the fact that we force the NaT of r3 to zero is ONLY valid + * as long as we don't come here with a ldfpX.s. + * For this reason we keep this sanity check + */ + if (ld.x6_op == 1 || ld.x6_op == 3) + printk(KERN_ERR "%s: register update on speculative load pair, error\n", + __FUNCTION__); + + setreg(ld.r3, ifa, 0, regs); + } + + /* + * Invalidate ALAT entries, if any, for both registers. + */ + if (ld.x6_op == 0x2) { + invala_fr(ld.r1); + invala_fr(ld.imm); + } + return 0; +} + + +static int +emulate_load_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs) +{ + struct ia64_fpreg fpr_init; + struct ia64_fpreg fpr_final; + unsigned long len = float_fsz[ld.x6_sz]; + + /* + * fr0 & fr1 don't need to be checked because Illegal Instruction + * faults have higher priority than unaligned faults. + * + * r0 cannot be found as the base as it would never generate an + * unaligned reference. + */ + + /* + * make sure we get clean buffers + */ + memset(&fpr_init,0, sizeof(fpr_init)); + memset(&fpr_final,0, sizeof(fpr_final)); + + /* + * ldfX.a we don't try to emulate anything but we must + * invalidate the ALAT entry. + * See comments in ldX for descriptions on how the various loads are handled. + */ + if (ld.x6_op != 0x2) { + if (copy_from_user(&fpr_init, (void __user *) ifa, len)) + return -1; + + DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz); + DDUMP("fpr_init =", &fpr_init, len); + /* + * we only do something for x6_op={0,8,9} + */ + switch( ld.x6_sz ) { + case 0: + mem2float_extended(&fpr_init, &fpr_final); + break; + case 1: + mem2float_integer(&fpr_init, &fpr_final); + break; + case 2: + mem2float_single(&fpr_init, &fpr_final); + break; + case 3: + mem2float_double(&fpr_init, &fpr_final); + break; + } + DDUMP("fpr_final =", &fpr_final, len); + /* + * XXX fixme + * + * A possible optimization would be to drop fpr_final and directly + * use the storage from the saved context i.e., the actual final + * destination (pt_regs, switch_stack or thread structure). 
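+	 *
+	 * (Sketch of that idea: setfpreg() already resolves the final
+	 * address in pt_regs/switch_stack/thread.fph, so the mem2float_*()
+	 * helpers could spill straight into that slot and save one
+	 * 16-byte copy per register.)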
+ */ + setfpreg(ld.r1, &fpr_final, regs); + } + + /* + * check for updates on any loads + */ + if (ld.op == 0x7 || ld.m) + emulate_load_updates(ld.op == 0x7 ? UPD_IMMEDIATE: UPD_REG, ld, regs, ifa); + + /* + * invalidate ALAT entry in case of advanced floating point loads + */ + if (ld.x6_op == 0x2) + invala_fr(ld.r1); + + return 0; +} + + +static int +emulate_store_float (unsigned long ifa, load_store_t ld, struct pt_regs *regs) +{ + struct ia64_fpreg fpr_init; + struct ia64_fpreg fpr_final; + unsigned long len = float_fsz[ld.x6_sz]; + + /* + * make sure we get clean buffers + */ + memset(&fpr_init,0, sizeof(fpr_init)); + memset(&fpr_final,0, sizeof(fpr_final)); + + /* + * if we get to this handler, Nat bits on both r3 and r2 have already + * been checked. so we don't need to do it + * + * extract the value to be stored + */ + getfpreg(ld.imm, &fpr_init, regs); + /* + * during this step, we extract the spilled registers from the saved + * context i.e., we refill. Then we store (no spill) to temporary + * aligned location + */ + switch( ld.x6_sz ) { + case 0: + float2mem_extended(&fpr_init, &fpr_final); + break; + case 1: + float2mem_integer(&fpr_init, &fpr_final); + break; + case 2: + float2mem_single(&fpr_init, &fpr_final); + break; + case 3: + float2mem_double(&fpr_init, &fpr_final); + break; + } + DPRINT("ld.r1=%d x6_sz=%d\n", ld.r1, ld.x6_sz); + DDUMP("fpr_init =", &fpr_init, len); + DDUMP("fpr_final =", &fpr_final, len); + + if (copy_to_user((void __user *) ifa, &fpr_final, len)) + return -1; + + /* + * stfX [r3]=r2,imm(9) + * + * NOTE: + * ld.r3 can never be r0, because r0 would not generate an + * unaligned access. + */ + if (ld.op == 0x7) { + unsigned long imm; + + /* + * form imm9: [12:6] contain first 7bits + */ + imm = ld.x << 7 | ld.r1; + /* + * sign extend (8bits) if m set + */ + if (ld.m) + imm |= SIGN_EXT9; + /* + * ifa == r3 (NaT is necessarily cleared) + */ + ifa += imm; + + DPRINT("imm=%lx r3=%lx\n", imm, ifa); + + setreg(ld.r3, ifa, 0, regs); + } + /* + * we don't have alat_invalidate_multiple() so we need + * to do the complete flush :-<< + */ + ia64_invala(); + + return 0; +} + +/* + * Make sure we log the unaligned access, so that user/sysadmin can notice it and + * eventually fix the program. However, we don't want to do that for every access so we + * pace it with jiffies. This isn't really MP-safe, but it doesn't really have to be + * either... + */ +static int +within_logging_rate_limit (void) +{ + static unsigned long count, last_time; + + if (jiffies - last_time > 5*HZ) + count = 0; + if (++count < 5) { + last_time = jiffies; + return 1; + } + return 0; + +} + +void +ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) +{ + struct ia64_psr *ipsr = ia64_psr(regs); + mm_segment_t old_fs = get_fs(); + unsigned long bundle[2]; + unsigned long opcode; + struct siginfo si; + const struct exception_table_entry *eh = NULL; + union { + unsigned long l; + load_store_t insn; + } u; + int ret = -1; + + if (ia64_psr(regs)->be) { + /* we don't support big-endian accesses */ + die_if_kernel("big-endian unaligned accesses are not supported", regs, 0); + goto force_sigbus; + } + + /* + * Treat kernel accesses for which there is an exception handler entry the same as + * user-level unaligned accesses. Otherwise, a clever program could trick this + * handler into reading an arbitrary kernel addresses... 
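+	 *
+	 * (That is, a kernel access is emulated only when the faulting
+	 * instruction is covered by an exception-table entry, as the
+	 * user-access helpers are; the search_exception_tables() lookup
+	 * below acts as that whitelist.)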
+ */ + if (!user_mode(regs)) + eh = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri); + if (user_mode(regs) || eh) { + if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0) + goto force_sigbus; + + if (!(current->thread.flags & IA64_THREAD_UAC_NOPRINT) + && within_logging_rate_limit()) + { + char buf[200]; /* comm[] is at most 16 bytes... */ + size_t len; + + len = sprintf(buf, "%s(%d): unaligned access to 0x%016lx, " + "ip=0x%016lx\n\r", current->comm, current->pid, + ifa, regs->cr_iip + ipsr->ri); + /* + * Don't call tty_write_message() if we're in the kernel; we might + * be holding locks... + */ + if (user_mode(regs)) + tty_write_message(current->signal->tty, buf); + buf[len-1] = '\0'; /* drop '\r' */ + printk(KERN_WARNING "%s", buf); /* watch for command names containing %s */ + } + } else { + if (within_logging_rate_limit()) + printk(KERN_WARNING "kernel unaligned access to 0x%016lx, ip=0x%016lx\n", + ifa, regs->cr_iip + ipsr->ri); + set_fs(KERNEL_DS); + } + + DPRINT("iip=%lx ifa=%lx isr=%lx (ei=%d, sp=%d)\n", + regs->cr_iip, ifa, regs->cr_ipsr, ipsr->ri, ipsr->it); + + if (__copy_from_user(bundle, (void __user *) regs->cr_iip, 16)) + goto failure; + + /* + * extract the instruction from the bundle given the slot number + */ + switch (ipsr->ri) { + case 0: u.l = (bundle[0] >> 5); break; + case 1: u.l = (bundle[0] >> 46) | (bundle[1] << 18); break; + case 2: u.l = (bundle[1] >> 23); break; + } + opcode = (u.l >> IA64_OPCODE_SHIFT) & IA64_OPCODE_MASK; + + DPRINT("opcode=%lx ld.qp=%d ld.r1=%d ld.imm=%d ld.r3=%d ld.x=%d ld.hint=%d " + "ld.x6=0x%x ld.m=%d ld.op=%d\n", opcode, u.insn.qp, u.insn.r1, u.insn.imm, + u.insn.r3, u.insn.x, u.insn.hint, u.insn.x6_sz, u.insn.m, u.insn.op); + + /* + * IMPORTANT: + * Notice that the switch statement DOES not cover all possible instructions + * that DO generate unaligned references. This is made on purpose because for some + * instructions it DOES NOT make sense to try and emulate the access. Sometimes it + * is WRONG to try and emulate. Here is a list of instruction we don't emulate i.e., + * the program will get a signal and die: + * + * load/store: + * - ldX.spill + * - stX.spill + * Reason: RNATs are based on addresses + * - ld16 + * - st16 + * Reason: ld16 and st16 are supposed to occur in a single + * memory op + * + * synchronization: + * - cmpxchg + * - fetchadd + * - xchg + * Reason: ATOMIC operations cannot be emulated properly using multiple + * instructions. + * + * speculative loads: + * - ldX.sZ + * Reason: side effects, code must be ready to deal with failure so simpler + * to let the load fail. + * --------------------------------------------------------------------------------- + * XXX fixme + * + * I would like to get rid of this switch case and do something + * more elegant. + */ + switch (opcode) { + case LDS_OP: + case LDSA_OP: + if (u.insn.x) + /* oops, really a semaphore op (cmpxchg, etc) */ + goto failure; + /* no break */ + case LDS_IMM_OP: + case LDSA_IMM_OP: + case LDFS_OP: + case LDFSA_OP: + case LDFS_IMM_OP: + /* + * The instruction will be retried with deferred exceptions turned on, and + * we should get Nat bit installed + * + * IMPORTANT: When PSR_ED is set, the register & immediate update forms + * are actually executed even though the operation failed. So we don't + * need to take care of this. 
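+		 *
+		 * (With psr.ed set, the retried speculative load completes as
+		 * a deferred exception: the target register simply gets its
+		 * NaT bit set instead of faulting again.)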
+ */ + DPRINT("forcing PSR_ED\n"); + regs->cr_ipsr |= IA64_PSR_ED; + goto done; + + case LD_OP: + case LDA_OP: + case LDBIAS_OP: + case LDACQ_OP: + case LDCCLR_OP: + case LDCNC_OP: + case LDCCLRACQ_OP: + if (u.insn.x) + /* oops, really a semaphore op (cmpxchg, etc) */ + goto failure; + /* no break */ + case LD_IMM_OP: + case LDA_IMM_OP: + case LDBIAS_IMM_OP: + case LDACQ_IMM_OP: + case LDCCLR_IMM_OP: + case LDCNC_IMM_OP: + case LDCCLRACQ_IMM_OP: + ret = emulate_load_int(ifa, u.insn, regs); + break; + + case ST_OP: + case STREL_OP: + if (u.insn.x) + /* oops, really a semaphore op (cmpxchg, etc) */ + goto failure; + /* no break */ + case ST_IMM_OP: + case STREL_IMM_OP: + ret = emulate_store_int(ifa, u.insn, regs); + break; + + case LDF_OP: + case LDFA_OP: + case LDFCCLR_OP: + case LDFCNC_OP: + case LDF_IMM_OP: + case LDFA_IMM_OP: + case LDFCCLR_IMM_OP: + case LDFCNC_IMM_OP: + if (u.insn.x) + ret = emulate_load_floatpair(ifa, u.insn, regs); + else + ret = emulate_load_float(ifa, u.insn, regs); + break; + + case STF_OP: + case STF_IMM_OP: + ret = emulate_store_float(ifa, u.insn, regs); + break; + + default: + goto failure; + } + DPRINT("ret=%d\n", ret); + if (ret) + goto failure; + + if (ipsr->ri == 2) + /* + * given today's architecture this case is not likely to happen because a + * memory access instruction (M) can never be in the last slot of a + * bundle. But let's keep it for now. + */ + regs->cr_iip += 16; + ipsr->ri = (ipsr->ri + 1) & 0x3; + + DPRINT("ipsr->ri=%d iip=%lx\n", ipsr->ri, regs->cr_iip); + done: + set_fs(old_fs); /* restore original address limit */ + return; + + failure: + /* something went wrong... */ + if (!user_mode(regs)) { + if (eh) { + ia64_handle_exception(regs, eh); + goto done; + } + die_if_kernel("error during unaligned kernel access\n", regs, ret); + /* NOT_REACHED */ + } + force_sigbus: + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_ADRALN; + si.si_addr = (void __user *) ifa; + si.si_flags = 0; + si.si_isr = 0; + si.si_imm = 0; + force_sig_info(SIGBUS, &si, current); + goto done; +} diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c new file mode 100644 index 000000000000..d494ff647cac --- /dev/null +++ b/arch/ia64/kernel/unwind.c @@ -0,0 +1,2306 @@ +/* + * Copyright (C) 1999-2004 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2003 Fenghua Yu <fenghua.yu@intel.com> + * - Change pt_regs_off() to make it less dependant on pt_regs structure. + */ +/* + * This file implements call frame unwind support for the Linux + * kernel. Parsing and processing the unwind information is + * time-consuming, so this implementation translates the unwind + * descriptors into unwind scripts. These scripts are very simple + * (basically a sequence of assignments) and efficient to execute. + * They are cached for later re-use. Each script is specific for a + * given instruction pointer address and the set of predicate values + * that the script depends on (most unwind descriptors are + * unconditional and scripts often do not depend on predicates at + * all). This code is based on the unwind conventions described in + * the "IA-64 Software Conventions and Runtime Architecture" manual. 
+ * + * SMP conventions: + * o updates to the global unwind data (in structure "unw") are serialized + * by the unw.lock spinlock + * o each unwind script has its own read-write lock; a thread must acquire + * a read lock before executing a script and must acquire a write lock + * before modifying a script + * o if both the unw.lock spinlock and a script's read-write lock must be + * acquired, then the read-write lock must be acquired first. + */ +#include <linux/module.h> +#include <linux/bootmem.h> +#include <linux/elf.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/unwind.h> + +#include <asm/delay.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/ptrace_offsets.h> +#include <asm/rse.h> +#include <asm/sections.h> +#include <asm/system.h> +#include <asm/uaccess.h> + +#include "entry.h" +#include "unwind_i.h" + +#define UNW_LOG_CACHE_SIZE 7 /* each unw_script is ~256 bytes in size */ +#define UNW_CACHE_SIZE (1 << UNW_LOG_CACHE_SIZE) + +#define UNW_LOG_HASH_SIZE (UNW_LOG_CACHE_SIZE + 1) +#define UNW_HASH_SIZE (1 << UNW_LOG_HASH_SIZE) + +#define UNW_STATS 0 /* WARNING: this disabled interrupts for long time-spans!! */ + +#ifdef UNW_DEBUG + static unsigned int unw_debug_level = UNW_DEBUG; +# define UNW_DEBUG_ON(n) unw_debug_level >= n + /* Do not code a printk level, not all debug lines end in newline */ +# define UNW_DPRINT(n, ...) if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__) +# define inline +#else /* !UNW_DEBUG */ +# define UNW_DEBUG_ON(n) 0 +# define UNW_DPRINT(n, ...) +#endif /* UNW_DEBUG */ + +#if UNW_STATS +# define STAT(x...) x +#else +# define STAT(x...) +#endif + +#define alloc_reg_state() kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC) +#define free_reg_state(usr) kfree(usr) +#define alloc_labeled_state() kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC) +#define free_labeled_state(usr) kfree(usr) + +typedef unsigned long unw_word; +typedef unsigned char unw_hash_index_t; + +static struct { + spinlock_t lock; /* spinlock for unwind data */ + + /* list of unwind tables (one per load-module) */ + struct unw_table *tables; + + unsigned long r0; /* constant 0 for r0 */ + + /* table of registers that prologues can save (and order in which they're saved): */ + const unsigned char save_order[8]; + + /* maps a preserved register index (preg_index) to corresponding switch_stack offset: */ + unsigned short sw_off[sizeof(struct unw_frame_info) / 8]; + + unsigned short lru_head; /* index of lead-recently used script */ + unsigned short lru_tail; /* index of most-recently used script */ + + /* index into unw_frame_info for preserved register i */ + unsigned short preg_index[UNW_NUM_REGS]; + + short pt_regs_offsets[32]; + + /* unwind table for the kernel: */ + struct unw_table kernel_table; + + /* unwind table describing the gate page (kernel code that is mapped into user space): */ + size_t gate_table_size; + unsigned long *gate_table; + + /* hash table that maps instruction pointer to script index: */ + unsigned short hash[UNW_HASH_SIZE]; + + /* script cache: */ + struct unw_script cache[UNW_CACHE_SIZE]; + +# ifdef UNW_DEBUG + const char *preg_name[UNW_NUM_REGS]; +# endif +# if UNW_STATS + struct { + struct { + int lookups; + int hinted_hits; + int normal_hits; + int collision_chain_traversals; + } cache; + struct { + unsigned long build_time; + unsigned long run_time; + unsigned long parse_time; + int builds; + int news; + int collisions; + int runs; + } script; + struct { + unsigned long init_time; + unsigned long 
unwind_time; + int inits; + int unwinds; + } api; + } stat; +# endif +} unw = { + .tables = &unw.kernel_table, + .lock = SPIN_LOCK_UNLOCKED, + .save_order = { + UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, + UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR + }, + .preg_index = { + offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_GR */ + offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_MEM */ + offsetof(struct unw_frame_info, bsp_loc)/8, + offsetof(struct unw_frame_info, bspstore_loc)/8, + offsetof(struct unw_frame_info, pfs_loc)/8, + offsetof(struct unw_frame_info, rnat_loc)/8, + offsetof(struct unw_frame_info, psp)/8, + offsetof(struct unw_frame_info, rp_loc)/8, + offsetof(struct unw_frame_info, r4)/8, + offsetof(struct unw_frame_info, r5)/8, + offsetof(struct unw_frame_info, r6)/8, + offsetof(struct unw_frame_info, r7)/8, + offsetof(struct unw_frame_info, unat_loc)/8, + offsetof(struct unw_frame_info, pr_loc)/8, + offsetof(struct unw_frame_info, lc_loc)/8, + offsetof(struct unw_frame_info, fpsr_loc)/8, + offsetof(struct unw_frame_info, b1_loc)/8, + offsetof(struct unw_frame_info, b2_loc)/8, + offsetof(struct unw_frame_info, b3_loc)/8, + offsetof(struct unw_frame_info, b4_loc)/8, + offsetof(struct unw_frame_info, b5_loc)/8, + offsetof(struct unw_frame_info, f2_loc)/8, + offsetof(struct unw_frame_info, f3_loc)/8, + offsetof(struct unw_frame_info, f4_loc)/8, + offsetof(struct unw_frame_info, f5_loc)/8, + offsetof(struct unw_frame_info, fr_loc[16 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[17 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[18 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[19 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[20 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[21 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[22 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[23 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[24 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[25 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[26 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[27 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[28 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[29 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[30 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[31 - 16])/8, + }, + .pt_regs_offsets = { + [0] = -1, + offsetof(struct pt_regs, r1), + offsetof(struct pt_regs, r2), + offsetof(struct pt_regs, r3), + [4] = -1, [5] = -1, [6] = -1, [7] = -1, + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), + offsetof(struct pt_regs, r16), + offsetof(struct pt_regs, r17), + offsetof(struct pt_regs, r18), + offsetof(struct pt_regs, r19), + offsetof(struct pt_regs, r20), + offsetof(struct pt_regs, r21), + offsetof(struct pt_regs, r22), + offsetof(struct pt_regs, r23), + offsetof(struct pt_regs, r24), + offsetof(struct pt_regs, r25), + offsetof(struct pt_regs, r26), + offsetof(struct pt_regs, r27), + offsetof(struct pt_regs, r28), + offsetof(struct pt_regs, r29), + offsetof(struct pt_regs, r30), + offsetof(struct pt_regs, r31), + }, + .hash = { [0 ... 
UNW_HASH_SIZE - 1] = -1 },
+#ifdef UNW_DEBUG
+	.preg_name = {
+		"pri_unat_gr", "pri_unat_mem", "bsp", "bspstore", "ar.pfs", "ar.rnat", "psp", "rp",
+		"r4", "r5", "r6", "r7",
+		"ar.unat", "pr", "ar.lc", "ar.fpsr",
+		"b1", "b2", "b3", "b4", "b5",
+		"f2", "f3", "f4", "f5",
+		"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
+		"f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"
+	}
+#endif
+};
+
+static inline int
+read_only (void *addr)
+{
+	return (unsigned long) ((char *) addr - (char *) &unw.r0) < sizeof(unw.r0);
+}
+
+/*
+ * Returns offset of rREG in struct pt_regs.
+ */
+static inline unsigned long
+pt_regs_off (unsigned long reg)
+{
+	short off = -1;
+
+	if (reg < ARRAY_SIZE(unw.pt_regs_offsets))
+		off = unw.pt_regs_offsets[reg];
+
+	if (off < 0) {
+		UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __FUNCTION__, reg);
+		off = 0;
+	}
+	return (unsigned long) off;
+}
+
+static inline struct pt_regs *
+get_scratch_regs (struct unw_frame_info *info)
+{
+	if (!info->pt) {
+		/* This should not happen with valid unwind info. */
+		UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __FUNCTION__);
+		if (info->flags & UNW_FLAG_INTERRUPT_FRAME)
+			info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1);
+		else
+			info->pt = info->sp - 16;
+	}
+	UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __FUNCTION__, info->sp, info->pt);
+	return (struct pt_regs *) info->pt;
+}
+
+/* Unwind accessors. */
+
+int
+unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char *nat, int write)
+{
+	unsigned long *addr, *nat_addr, nat_mask = 0, dummy_nat;
+	struct unw_ireg *ireg;
+	struct pt_regs *pt;
+
+	if ((unsigned) regnum - 1 >= 127) {
+		if (regnum == 0 && !write) {
+			*val = 0;	/* read r0 always returns 0 */
+			*nat = 0;
+			return 0;
+		}
+		UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n",
+			   __FUNCTION__, regnum);
+		return -1;
+	}
+
+	if (regnum < 32) {
+		if (regnum >= 4 && regnum <= 7) {
+			/* access a preserved register */
+			ireg = &info->r4 + (regnum - 4);
+			addr = ireg->loc;
+			if (addr) {
+				nat_addr = addr + ireg->nat.off;
+				switch (ireg->nat.type) {
+				case UNW_NAT_VAL:
+					/* simulate getf.sig/setf.sig */
+					if (write) {
+						if (*nat) {
+							/* write NaTVal and be done with it */
+							addr[0] = 0;
+							addr[1] = 0x1fffe;
+							return 0;
+						}
+						addr[1] = 0x1003e;
+					} else {
+						/* NaTVal has significand 0 and exponent 0x1fffe,
+						   matching what the write path above stores */
+						if (addr[0] == 0 && addr[1] == 0x1fffe) {
+							/* return NaT and be done with it */
+							*val = 0;
+							*nat = 1;
+							return 0;
+						}
+					}
+					/* fall through */
+				case UNW_NAT_NONE:
+					dummy_nat = 0;
+					nat_addr = &dummy_nat;
+					break;
+
+				case UNW_NAT_MEMSTK:
+					nat_mask = (1UL << ((long) addr & 0x1f8)/8);
+					break;
+
+				case UNW_NAT_REGSTK:
+					nat_addr = ia64_rse_rnat_addr(addr);
+					if ((unsigned long) addr < info->regstk.limit
+					    || (unsigned long) addr >= info->regstk.top)
+					{
+						UNW_DPRINT(0, "unwind.%s: %p outside of regstk "
+							"[0x%lx-0x%lx)\n",
+							__FUNCTION__, (void *) addr,
+							info->regstk.limit,
+							info->regstk.top);
+						return -1;
+					}
+					if ((unsigned long) nat_addr >= info->regstk.top)
+						nat_addr = &info->sw->ar_rnat;
+					nat_mask = (1UL << ia64_rse_slot_num(addr));
+					break;
+				}
+			} else {
+				addr = &info->sw->r4 + (regnum - 4);
+				nat_addr = &info->sw->ar_unat;
+				nat_mask = (1UL << ((long) addr & 0x1f8)/8);
+			}
+		} else {
+			/* access a scratch register */
+			pt = get_scratch_regs(info);
+			addr = (unsigned long *) ((unsigned long)pt + pt_regs_off(regnum));
+			if (info->pri_unat_loc)
+				nat_addr = info->pri_unat_loc;
+			else
+				nat_addr = &info->sw->ar_unat;
+			nat_mask = (1UL << ((long) addr & 0x1f8)/8);
+		}
+	} else {
+		/*
access a stacked register */ + addr = ia64_rse_skip_regs((unsigned long *) info->bsp, regnum - 32); + nat_addr = ia64_rse_rnat_addr(addr); + if ((unsigned long) addr < info->regstk.limit + || (unsigned long) addr >= info->regstk.top) + { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside " + "of rbs\n", __FUNCTION__); + return -1; + } + if ((unsigned long) nat_addr >= info->regstk.top) + nat_addr = &info->sw->ar_rnat; + nat_mask = (1UL << ia64_rse_slot_num(addr)); + } + + if (write) { + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else { + *addr = *val; + if (*nat) + *nat_addr |= nat_mask; + else + *nat_addr &= ~nat_mask; + } + } else { + if ((*nat_addr & nat_mask) == 0) { + *val = *addr; + *nat = 0; + } else { + *val = 0; /* if register is a NaT, *addr may contain kernel data! */ + *nat = 1; + } + } + return 0; +} +EXPORT_SYMBOL(unw_access_gr); + +int +unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int write) +{ + unsigned long *addr; + struct pt_regs *pt; + + switch (regnum) { + /* scratch: */ + case 0: pt = get_scratch_regs(info); addr = &pt->b0; break; + case 6: pt = get_scratch_regs(info); addr = &pt->b6; break; + case 7: pt = get_scratch_regs(info); addr = &pt->b7; break; + + /* preserved: */ + case 1: case 2: case 3: case 4: case 5: + addr = *(&info->b1_loc + (regnum - 1)); + if (!addr) + addr = &info->sw->b1 + (regnum - 1); + break; + + default: + UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n", + __FUNCTION__, regnum); + return -1; + } + if (write) + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_br); + +int +unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, int write) +{ + struct ia64_fpreg *addr = NULL; + struct pt_regs *pt; + + if ((unsigned) (regnum - 2) >= 126) { + UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n", + __FUNCTION__, regnum); + return -1; + } + + if (regnum <= 5) { + addr = *(&info->f2_loc + (regnum - 2)); + if (!addr) + addr = &info->sw->f2 + (regnum - 2); + } else if (regnum <= 15) { + if (regnum <= 11) { + pt = get_scratch_regs(info); + addr = &pt->f6 + (regnum - 6); + } + else + addr = &info->sw->f12 + (regnum - 12); + } else if (regnum <= 31) { + addr = info->fr_loc[regnum - 16]; + if (!addr) + addr = &info->sw->f16 + (regnum - 16); + } else { + struct task_struct *t = info->task; + + if (write) + ia64_sync_fph(t); + else + ia64_flush_fph(t); + addr = t->thread.fph + (regnum - 32); + } + + if (write) + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_fr); + +int +unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int write) +{ + unsigned long *addr; + struct pt_regs *pt; + + switch (regnum) { + case UNW_AR_BSP: + addr = info->bsp_loc; + if (!addr) + addr = &info->sw->ar_bspstore; + break; + + case UNW_AR_BSPSTORE: + addr = info->bspstore_loc; + if (!addr) + addr = &info->sw->ar_bspstore; + break; + + case UNW_AR_PFS: + addr = info->pfs_loc; + if (!addr) + addr = &info->sw->ar_pfs; + break; + + case UNW_AR_RNAT: + addr = info->rnat_loc; + if (!addr) + addr = &info->sw->ar_rnat; + break; + + case UNW_AR_UNAT: + addr = info->unat_loc; + if 
(!addr) + addr = &info->sw->ar_unat; + break; + + case UNW_AR_LC: + addr = info->lc_loc; + if (!addr) + addr = &info->sw->ar_lc; + break; + + case UNW_AR_EC: + if (!info->cfm_loc) + return -1; + if (write) + *info->cfm_loc = + (*info->cfm_loc & ~(0x3fUL << 52)) | ((*val & 0x3f) << 52); + else + *val = (*info->cfm_loc >> 52) & 0x3f; + return 0; + + case UNW_AR_FPSR: + addr = info->fpsr_loc; + if (!addr) + addr = &info->sw->ar_fpsr; + break; + + case UNW_AR_RSC: + pt = get_scratch_regs(info); + addr = &pt->ar_rsc; + break; + + case UNW_AR_CCV: + pt = get_scratch_regs(info); + addr = &pt->ar_ccv; + break; + + case UNW_AR_CSD: + pt = get_scratch_regs(info); + addr = &pt->ar_csd; + break; + + case UNW_AR_SSD: + pt = get_scratch_regs(info); + addr = &pt->ar_ssd; + break; + + default: + UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n", + __FUNCTION__, regnum); + return -1; + } + + if (write) { + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + } else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_ar); + +int +unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write) +{ + unsigned long *addr; + + addr = info->pr_loc; + if (!addr) + addr = &info->sw->pr; + + if (write) { + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + } else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_pr); + + +/* Routines to manipulate the state stack. */ + +static inline void +push (struct unw_state_record *sr) +{ + struct unw_reg_state *rs; + + rs = alloc_reg_state(); + if (!rs) { + printk(KERN_ERR "unwind: cannot stack reg state!\n"); + return; + } + memcpy(rs, &sr->curr, sizeof(*rs)); + sr->curr.next = rs; +} + +static void +pop (struct unw_state_record *sr) +{ + struct unw_reg_state *rs = sr->curr.next; + + if (!rs) { + printk(KERN_ERR "unwind: stack underflow!\n"); + return; + } + memcpy(&sr->curr, rs, sizeof(*rs)); + free_reg_state(rs); +} + +/* Make a copy of the state stack. Non-recursive to avoid stack overflows. */ +static struct unw_reg_state * +dup_state_stack (struct unw_reg_state *rs) +{ + struct unw_reg_state *copy, *prev = NULL, *first = NULL; + + while (rs) { + copy = alloc_reg_state(); + if (!copy) { + printk(KERN_ERR "unwind.dup_state_stack: out of memory\n"); + return NULL; + } + memcpy(copy, rs, sizeof(*copy)); + if (first) + prev->next = copy; + else + first = copy; + rs = rs->next; + prev = copy; + } + return first; +} + +/* Free all stacked register states (but not RS itself). */ +static void +free_state_stack (struct unw_reg_state *rs) +{ + struct unw_reg_state *p, *next; + + for (p = rs->next; p != NULL; p = next) { + next = p->next; + free_reg_state(p); + } + rs->next = NULL; +} + +/* Unwind decoder routines */ + +static enum unw_register_index __attribute_const__ +decode_abreg (unsigned char abreg, int memory) +{ + switch (abreg) { + case 0x04 ... 0x07: return UNW_REG_R4 + (abreg - 0x04); + case 0x22 ... 0x25: return UNW_REG_F2 + (abreg - 0x22); + case 0x30 ... 0x3f: return UNW_REG_F16 + (abreg - 0x30); + case 0x41 ... 0x45: return UNW_REG_B1 + (abreg - 0x41); + case 0x60: return UNW_REG_PR; + case 0x61: return UNW_REG_PSP; + case 0x62: return memory ? 
UNW_REG_PRI_UNAT_MEM : UNW_REG_PRI_UNAT_GR; + case 0x63: return UNW_REG_RP; + case 0x64: return UNW_REG_BSP; + case 0x65: return UNW_REG_BSPSTORE; + case 0x66: return UNW_REG_RNAT; + case 0x67: return UNW_REG_UNAT; + case 0x68: return UNW_REG_FPSR; + case 0x69: return UNW_REG_PFS; + case 0x6a: return UNW_REG_LC; + default: + break; + } + UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __FUNCTION__, abreg); + return UNW_REG_LC; +} + +static void +set_reg (struct unw_reg_info *reg, enum unw_where where, int when, unsigned long val) +{ + reg->val = val; + reg->where = where; + if (reg->when == UNW_WHEN_NEVER) + reg->when = when; +} + +static void +alloc_spill_area (unsigned long *offp, unsigned long regsize, + struct unw_reg_info *lo, struct unw_reg_info *hi) +{ + struct unw_reg_info *reg; + + for (reg = hi; reg >= lo; --reg) { + if (reg->where == UNW_WHERE_SPILL_HOME) { + reg->where = UNW_WHERE_PSPREL; + *offp -= regsize; + reg->val = *offp; + } + } +} + +static inline void +spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word t) +{ + struct unw_reg_info *reg; + + for (reg = *regp; reg <= lim; ++reg) { + if (reg->where == UNW_WHERE_SPILL_HOME) { + reg->when = t; + *regp = reg + 1; + return; + } + } + UNW_DPRINT(0, "unwind.%s: excess spill!\n", __FUNCTION__); +} + +static inline void +finish_prologue (struct unw_state_record *sr) +{ + struct unw_reg_info *reg; + unsigned long off; + int i; + + /* + * First, resolve implicit register save locations (see Section "11.4.2.3 Rules + * for Using Unwind Descriptors", rule 3): + */ + for (i = 0; i < (int) ARRAY_SIZE(unw.save_order); ++i) { + reg = sr->curr.reg + unw.save_order[i]; + if (reg->where == UNW_WHERE_GR_SAVE) { + reg->where = UNW_WHERE_GR; + reg->val = sr->gr_save_loc++; + } + } + + /* + * Next, compute when the fp, general, and branch registers get + * saved. This must come before alloc_spill_area() because + * we need to know which registers are spilled to their home + * locations. + */ + if (sr->imask) { + unsigned char kind, mask = 0, *cp = sr->imask; + int t; + static const unsigned char limit[3] = { + UNW_REG_F31, UNW_REG_R7, UNW_REG_B5 + }; + struct unw_reg_info *(regs[3]); + + regs[0] = sr->curr.reg + UNW_REG_F2; + regs[1] = sr->curr.reg + UNW_REG_R4; + regs[2] = sr->curr.reg + UNW_REG_B1; + + for (t = 0; t < sr->region_len; ++t) { + if ((t & 3) == 0) + mask = *cp++; + kind = (mask >> 2*(3-(t & 3))) & 3; + if (kind > 0) + spill_next_when(®s[kind - 1], sr->curr.reg + limit[kind - 1], + sr->region_start + t); + } + } + /* + * Next, lay out the memory stack spill area: + */ + if (sr->any_spills) { + off = sr->spill_offset; + alloc_spill_area(&off, 16, sr->curr.reg + UNW_REG_F2, sr->curr.reg + UNW_REG_F31); + alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_B1, sr->curr.reg + UNW_REG_B5); + alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_R4, sr->curr.reg + UNW_REG_R7); + } +} + +/* + * Region header descriptors. 
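+ *
+ * (For instance, a prologue_gr header saying "rlen=8, mask=0x8, grsave=35"
+ * means: the next 8 instruction slots form a prologue and rp is saved in
+ * r35; desc_prologue() below walks unw.save_order, handing out grsave,
+ * grsave+1, ... to the registers whose mask bits are set.)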
+ */ + +static void +desc_prologue (int body, unw_word rlen, unsigned char mask, unsigned char grsave, + struct unw_state_record *sr) +{ + int i, region_start; + + if (!(sr->in_body || sr->first_region)) + finish_prologue(sr); + sr->first_region = 0; + + /* check if we're done: */ + if (sr->when_target < sr->region_start + sr->region_len) { + sr->done = 1; + return; + } + + region_start = sr->region_start + sr->region_len; + + for (i = 0; i < sr->epilogue_count; ++i) + pop(sr); + sr->epilogue_count = 0; + sr->epilogue_start = UNW_WHEN_NEVER; + + sr->region_start = region_start; + sr->region_len = rlen; + sr->in_body = body; + + if (!body) { + push(sr); + + for (i = 0; i < 4; ++i) { + if (mask & 0x8) + set_reg(sr->curr.reg + unw.save_order[i], UNW_WHERE_GR, + sr->region_start + sr->region_len - 1, grsave++); + mask <<= 1; + } + sr->gr_save_loc = grsave; + sr->any_spills = 0; + sr->imask = NULL; + sr->spill_offset = 0x10; /* default to psp+16 */ + } +} + +/* + * Prologue descriptors. + */ + +static inline void +desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr) +{ + if (abi == 3 && context == 'i') { + sr->flags |= UNW_FLAG_INTERRUPT_FRAME; + UNW_DPRINT(3, "unwind.%s: interrupt frame\n", __FUNCTION__); + } + else + UNW_DPRINT(0, "unwind%s: ignoring unwabi(abi=0x%x,context=0x%x)\n", + __FUNCTION__, abi, context); +} + +static inline void +desc_br_gr (unsigned char brmask, unsigned char gr, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 5; ++i) { + if (brmask & 1) + set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_GR, + sr->region_start + sr->region_len - 1, gr++); + brmask >>= 1; + } +} + +static inline void +desc_br_mem (unsigned char brmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 5; ++i) { + if (brmask & 1) { + set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + brmask >>= 1; + } +} + +static inline void +desc_frgr_mem (unsigned char grmask, unw_word frmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((grmask & 1) != 0) { + set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + grmask >>= 1; + } + for (i = 0; i < 20; ++i) { + if ((frmask & 1) != 0) { + int base = (i < 4) ? 
UNW_REG_F2 : UNW_REG_F16 - 4; + set_reg(sr->curr.reg + base + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + frmask >>= 1; + } +} + +static inline void +desc_fr_mem (unsigned char frmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((frmask & 1) != 0) { + set_reg(sr->curr.reg + UNW_REG_F2 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + frmask >>= 1; + } +} + +static inline void +desc_gr_gr (unsigned char grmask, unsigned char gr, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((grmask & 1) != 0) + set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_GR, + sr->region_start + sr->region_len - 1, gr++); + grmask >>= 1; + } +} + +static inline void +desc_gr_mem (unsigned char grmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((grmask & 1) != 0) { + set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + grmask >>= 1; + } +} + +static inline void +desc_mem_stack_f (unw_word t, unw_word size, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + UNW_REG_PSP, UNW_WHERE_NONE, + sr->region_start + min_t(int, t, sr->region_len - 1), 16*size); +} + +static inline void +desc_mem_stack_v (unw_word t, struct unw_state_record *sr) +{ + sr->curr.reg[UNW_REG_PSP].when = sr->region_start + min_t(int, t, sr->region_len - 1); +} + +static inline void +desc_reg_gr (unsigned char reg, unsigned char dst, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + reg, UNW_WHERE_GR, sr->region_start + sr->region_len - 1, dst); +} + +static inline void +desc_reg_psprel (unsigned char reg, unw_word pspoff, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + reg, UNW_WHERE_PSPREL, sr->region_start + sr->region_len - 1, + 0x10 - 4*pspoff); +} + +static inline void +desc_reg_sprel (unsigned char reg, unw_word spoff, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + reg, UNW_WHERE_SPREL, sr->region_start + sr->region_len - 1, + 4*spoff); +} + +static inline void +desc_rp_br (unsigned char dst, struct unw_state_record *sr) +{ + sr->return_link_reg = dst; +} + +static inline void +desc_reg_when (unsigned char regnum, unw_word t, struct unw_state_record *sr) +{ + struct unw_reg_info *reg = sr->curr.reg + regnum; + + if (reg->where == UNW_WHERE_NONE) + reg->where = UNW_WHERE_GR_SAVE; + reg->when = sr->region_start + min_t(int, t, sr->region_len - 1); +} + +static inline void +desc_spill_base (unw_word pspoff, struct unw_state_record *sr) +{ + sr->spill_offset = 0x10 - 4*pspoff; +} + +static inline unsigned char * +desc_spill_mask (unsigned char *imaskp, struct unw_state_record *sr) +{ + sr->imask = imaskp; + return imaskp + (2*sr->region_len + 7)/8; +} + +/* + * Body descriptors. 
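+ *
+ * (Worked example for desc_epilogue() below: epilogue(t=2, ecount=0) in a
+ * 16-slot region starting at region_start=0 gives
+ * epilogue_start = 0 + 16 - 1 - 2 = 13, i.e. sp is restored two slots
+ * before the region ends, and epilogue_count is set to ecount + 1 = 1.)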
+ */ +static inline void +desc_epilogue (unw_word t, unw_word ecount, struct unw_state_record *sr) +{ + sr->epilogue_start = sr->region_start + sr->region_len - 1 - t; + sr->epilogue_count = ecount + 1; +} + +static inline void +desc_copy_state (unw_word label, struct unw_state_record *sr) +{ + struct unw_labeled_state *ls; + + for (ls = sr->labeled_states; ls; ls = ls->next) { + if (ls->label == label) { + free_state_stack(&sr->curr); + memcpy(&sr->curr, &ls->saved_state, sizeof(sr->curr)); + sr->curr.next = dup_state_stack(ls->saved_state.next); + return; + } + } + printk(KERN_ERR "unwind: failed to find state labeled 0x%lx\n", label); +} + +static inline void +desc_label_state (unw_word label, struct unw_state_record *sr) +{ + struct unw_labeled_state *ls; + + ls = alloc_labeled_state(); + if (!ls) { + printk(KERN_ERR "unwind.desc_label_state(): out of memory\n"); + return; + } + ls->label = label; + memcpy(&ls->saved_state, &sr->curr, sizeof(ls->saved_state)); + ls->saved_state.next = dup_state_stack(sr->curr.next); + + /* insert into list of labeled states: */ + ls->next = sr->labeled_states; + sr->labeled_states = ls; +} + +/* + * General descriptors. + */ + +static inline int +desc_is_active (unsigned char qp, unw_word t, struct unw_state_record *sr) +{ + if (sr->when_target <= sr->region_start + min_t(int, t, sr->region_len - 1)) + return 0; + if (qp > 0) { + if ((sr->pr_val & (1UL << qp)) == 0) + return 0; + sr->pr_mask |= (1UL << qp); + } + return 1; +} + +static inline void +desc_restore_p (unsigned char qp, unw_word t, unsigned char abreg, struct unw_state_record *sr) +{ + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + r = sr->curr.reg + decode_abreg(abreg, 0); + r->where = UNW_WHERE_NONE; + r->when = UNW_WHEN_NEVER; + r->val = 0; +} + +static inline void +desc_spill_reg_p (unsigned char qp, unw_word t, unsigned char abreg, unsigned char x, + unsigned char ytreg, struct unw_state_record *sr) +{ + enum unw_where where = UNW_WHERE_GR; + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + if (x) + where = UNW_WHERE_BR; + else if (ytreg & 0x80) + where = UNW_WHERE_FR; + + r = sr->curr.reg + decode_abreg(abreg, 0); + r->where = where; + r->when = sr->region_start + min_t(int, t, sr->region_len - 1); + r->val = (ytreg & 0x7f); +} + +static inline void +desc_spill_psprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word pspoff, + struct unw_state_record *sr) +{ + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + r = sr->curr.reg + decode_abreg(abreg, 1); + r->where = UNW_WHERE_PSPREL; + r->when = sr->region_start + min_t(int, t, sr->region_len - 1); + r->val = 0x10 - 4*pspoff; +} + +static inline void +desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word spoff, + struct unw_state_record *sr) +{ + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + r = sr->curr.reg + decode_abreg(abreg, 1); + r->where = UNW_WHERE_SPREL; + r->when = sr->region_start + min_t(int, t, sr->region_len - 1); + r->val = 4*spoff; +} + +#define UNW_DEC_BAD_CODE(code) printk(KERN_ERR "unwind: unknown code 0x%02x\n", \ + code); + +/* + * region headers: + */ +#define UNW_DEC_PROLOGUE_GR(fmt,r,m,gr,arg) desc_prologue(0,r,m,gr,arg) +#define UNW_DEC_PROLOGUE(fmt,b,r,arg) desc_prologue(b,r,0,32,arg) +/* + * prologue descriptors: + */ +#define UNW_DEC_ABI(fmt,a,c,arg) desc_abi(a,c,arg) +#define UNW_DEC_BR_GR(fmt,b,g,arg) desc_br_gr(b,g,arg) +#define UNW_DEC_BR_MEM(fmt,b,arg) 
desc_br_mem(b,arg)
+#define UNW_DEC_FRGR_MEM(fmt,g,f,arg)		desc_frgr_mem(g,f,arg)
+#define UNW_DEC_FR_MEM(fmt,f,arg)		desc_fr_mem(f,arg)
+#define UNW_DEC_GR_GR(fmt,m,g,arg)		desc_gr_gr(m,g,arg)
+#define UNW_DEC_GR_MEM(fmt,m,arg)		desc_gr_mem(m,arg)
+#define UNW_DEC_MEM_STACK_F(fmt,t,s,arg)	desc_mem_stack_f(t,s,arg)
+#define UNW_DEC_MEM_STACK_V(fmt,t,arg)		desc_mem_stack_v(t,arg)
+#define UNW_DEC_REG_GR(fmt,r,d,arg)		desc_reg_gr(r,d,arg)
+#define UNW_DEC_REG_PSPREL(fmt,r,o,arg)		desc_reg_psprel(r,o,arg)
+#define UNW_DEC_REG_SPREL(fmt,r,o,arg)		desc_reg_sprel(r,o,arg)
+#define UNW_DEC_REG_WHEN(fmt,r,t,arg)		desc_reg_when(r,t,arg)
+#define UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_GR,t,arg)
+#define UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_MEM,t,arg)
+#define UNW_DEC_PRIUNAT_GR(fmt,r,arg)		desc_reg_gr(UNW_REG_PRI_UNAT_GR,r,arg)
+#define UNW_DEC_PRIUNAT_PSPREL(fmt,o,arg)	desc_reg_psprel(UNW_REG_PRI_UNAT_MEM,o,arg)
+#define UNW_DEC_PRIUNAT_SPREL(fmt,o,arg)	desc_reg_sprel(UNW_REG_PRI_UNAT_MEM,o,arg)
+#define UNW_DEC_RP_BR(fmt,d,arg)		desc_rp_br(d,arg)
+#define UNW_DEC_SPILL_BASE(fmt,o,arg)		desc_spill_base(o,arg)
+#define UNW_DEC_SPILL_MASK(fmt,m,arg)		(m = desc_spill_mask(m,arg))
+/*
+ * body descriptors:
+ */
+#define UNW_DEC_EPILOGUE(fmt,t,c,arg)		desc_epilogue(t,c,arg)
+#define UNW_DEC_COPY_STATE(fmt,l,arg)		desc_copy_state(l,arg)
+#define UNW_DEC_LABEL_STATE(fmt,l,arg)		desc_label_state(l,arg)
+/*
+ * general unwind descriptors:
+ */
+#define UNW_DEC_SPILL_REG_P(f,p,t,a,x,y,arg)	desc_spill_reg_p(p,t,a,x,y,arg)
+#define UNW_DEC_SPILL_REG(f,t,a,x,y,arg)	desc_spill_reg_p(0,t,a,x,y,arg)
+#define UNW_DEC_SPILL_PSPREL_P(f,p,t,a,o,arg)	desc_spill_psprel_p(p,t,a,o,arg)
+#define UNW_DEC_SPILL_PSPREL(f,t,a,o,arg)	desc_spill_psprel_p(0,t,a,o,arg)
+#define UNW_DEC_SPILL_SPREL_P(f,p,t,a,o,arg)	desc_spill_sprel_p(p,t,a,o,arg)
+#define UNW_DEC_SPILL_SPREL(f,t,a,o,arg)	desc_spill_sprel_p(0,t,a,o,arg)
+#define UNW_DEC_RESTORE_P(f,p,t,a,arg)		desc_restore_p(p,t,a,arg)
+#define UNW_DEC_RESTORE(f,t,a,arg)		desc_restore_p(0,t,a,arg)
+
+#include "unwind_decoder.c"
+
+
+/* Unwind scripts. */
+
+static inline unw_hash_index_t
+hash (unsigned long ip)
+{
+# define hashmagic	0x9e3779b97f4a7c16UL	/* based on ((sqrt(5)-1)/2)*2^64 */
+
+	return (ip >> 4)*hashmagic >> (64 - UNW_LOG_HASH_SIZE);
+#undef hashmagic
+}
+
+static inline long
+cache_match (struct unw_script *script, unsigned long ip, unsigned long pr)
+{
+	read_lock(&script->lock);
+	if (ip == script->ip && ((pr ^ script->pr_val) & script->pr_mask) == 0)
+		/* keep the read lock...
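+		 * the caller executes the script while holding it and
+		 * releases it once the frame state has been updated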
*/ + return 1; + read_unlock(&script->lock); + return 0; +} + +static inline struct unw_script * +script_lookup (struct unw_frame_info *info) +{ + struct unw_script *script = unw.cache + info->hint; + unsigned short index; + unsigned long ip, pr; + + if (UNW_DEBUG_ON(0)) + return NULL; /* Always regenerate scripts in debug mode */ + + STAT(++unw.stat.cache.lookups); + + ip = info->ip; + pr = info->pr; + + if (cache_match(script, ip, pr)) { + STAT(++unw.stat.cache.hinted_hits); + return script; + } + + index = unw.hash[hash(ip)]; + if (index >= UNW_CACHE_SIZE) + return NULL; + + script = unw.cache + index; + while (1) { + if (cache_match(script, ip, pr)) { + /* update hint; no locking required as single-word writes are atomic */ + STAT(++unw.stat.cache.normal_hits); + unw.cache[info->prev_script].hint = script - unw.cache; + return script; + } + if (script->coll_chain >= UNW_HASH_SIZE) + return NULL; + script = unw.cache + script->coll_chain; + STAT(++unw.stat.cache.collision_chain_traversals); + } +} + +/* + * On returning, a write lock for the SCRIPT is still being held. + */ +static inline struct unw_script * +script_new (unsigned long ip) +{ + struct unw_script *script, *prev, *tmp; + unw_hash_index_t index; + unsigned short head; + + STAT(++unw.stat.script.news); + + /* + * Can't (easily) use cmpxchg() here because of ABA problem + * that is intrinsic in cmpxchg()... + */ + head = unw.lru_head; + script = unw.cache + head; + unw.lru_head = script->lru_chain; + + /* + * We'd deadlock here if we interrupted a thread that is holding a read lock on + * script->lock. Thus, if the write_trylock() fails, we simply bail out. The + * alternative would be to disable interrupts whenever we hold a read-lock, but + * that seems silly. + */ + if (!write_trylock(&script->lock)) + return NULL; + + /* re-insert script at the tail of the LRU chain: */ + unw.cache[unw.lru_tail].lru_chain = head; + unw.lru_tail = head; + + /* remove the old script from the hash table (if it's there): */ + if (script->ip) { + index = hash(script->ip); + tmp = unw.cache + unw.hash[index]; + prev = NULL; + while (1) { + if (tmp == script) { + if (prev) + prev->coll_chain = tmp->coll_chain; + else + unw.hash[index] = tmp->coll_chain; + break; + } else + prev = tmp; + if (tmp->coll_chain >= UNW_CACHE_SIZE) + /* old script wasn't in the hash-table */ + break; + tmp = unw.cache + tmp->coll_chain; + } + } + + /* enter new script in the hash table */ + index = hash(ip); + script->coll_chain = unw.hash[index]; + unw.hash[index] = script - unw.cache; + + script->ip = ip; /* set new IP while we're holding the locks */ + + STAT(if (script->coll_chain < UNW_CACHE_SIZE) ++unw.stat.script.collisions); + + script->flags = 0; + script->hint = 0; + script->count = 0; + return script; +} + +static void +script_finalize (struct unw_script *script, struct unw_state_record *sr) +{ + script->pr_mask = sr->pr_mask; + script->pr_val = sr->pr_val; + /* + * We could down-grade our write-lock on script->lock here but + * the rwlock API doesn't offer atomic lock downgrading, so + * we'll just keep the write-lock and release it later when + * we're done using the script. 
+ */
+}
+
+static inline void
+script_emit (struct unw_script *script, struct unw_insn insn)
+{
+	if (script->count >= UNW_MAX_SCRIPT_LEN) {
+		UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n",
+			   __FUNCTION__, UNW_MAX_SCRIPT_LEN);
+		return;
+	}
+	script->insn[script->count++] = insn;
+}
+
+static inline void
+emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script)
+{
+	struct unw_reg_info *r = sr->curr.reg + i;
+	enum unw_insn_opcode opc;
+	struct unw_insn insn;
+	unsigned long val = 0;
+
+	switch (r->where) {
+	case UNW_WHERE_GR:
+		if (r->val >= 32) {
+			/* register got spilled to a stacked register */
+			opc = UNW_INSN_SETNAT_TYPE;
+			val = UNW_NAT_REGSTK;
+		} else
+			/* register got spilled to a scratch register */
+			opc = UNW_INSN_SETNAT_MEMSTK;
+		break;
+
+	case UNW_WHERE_FR:
+		opc = UNW_INSN_SETNAT_TYPE;
+		val = UNW_NAT_VAL;
+		break;
+
+	case UNW_WHERE_BR:
+		opc = UNW_INSN_SETNAT_TYPE;
+		val = UNW_NAT_NONE;
+		break;
+
+	case UNW_WHERE_PSPREL:
+	case UNW_WHERE_SPREL:
+		opc = UNW_INSN_SETNAT_MEMSTK;
+		break;
+
+	default:
+		UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n",
+			   __FUNCTION__, r->where);
+		return;
+	}
+	insn.opc = opc;
+	insn.dst = unw.preg_index[i];
+	insn.val = val;
+	script_emit(script, insn);
+}
+
+static void
+compile_reg (struct unw_state_record *sr, int i, struct unw_script *script)
+{
+	struct unw_reg_info *r = sr->curr.reg + i;
+	enum unw_insn_opcode opc;
+	unsigned long val, rval;
+	struct unw_insn insn;
+	long need_nat_info;
+
+	if (r->where == UNW_WHERE_NONE || r->when >= sr->when_target)
+		return;
+
+	opc = UNW_INSN_MOVE;
+	val = rval = r->val;
+	need_nat_info = (i >= UNW_REG_R4 && i <= UNW_REG_R7);
+
+	switch (r->where) {
+	case UNW_WHERE_GR:
+		if (rval >= 32) {
+			opc = UNW_INSN_MOVE_STACKED;
+			val = rval - 32;
+		} else if (rval >= 4 && rval <= 7) {
+			if (need_nat_info) {
+				opc = UNW_INSN_MOVE2;
+				need_nat_info = 0;
+			}
+			val = unw.preg_index[UNW_REG_R4 + (rval - 4)];
+		} else if (rval == 0) {
+			opc = UNW_INSN_MOVE_CONST;
+			val = 0;
+		} else {
+			/* register got spilled to a scratch register */
+			opc = UNW_INSN_MOVE_SCRATCH;
+			val = pt_regs_off(rval);
+		}
+		break;
+
+	case UNW_WHERE_FR:
+		if (rval <= 5)
+			val = unw.preg_index[UNW_REG_F2 + (rval - 2)];
+		else if (rval >= 16 && rval <= 31)
+			val = unw.preg_index[UNW_REG_F16 + (rval - 16)];
+		else {
+			opc = UNW_INSN_MOVE_SCRATCH;
+			if (rval <= 11)
+				val = offsetof(struct pt_regs, f6) + 16*(rval - 6);
+			else
+				UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n",
+					   __FUNCTION__, rval);
+		}
+		break;
+
+	case UNW_WHERE_BR:
+		if (rval >= 1 && rval <= 5)
+			val = unw.preg_index[UNW_REG_B1 + (rval - 1)];
+		else {
+			opc = UNW_INSN_MOVE_SCRATCH;
+			if (rval == 0)
+				val = offsetof(struct pt_regs, b0);
+			else if (rval == 6)
+				val = offsetof(struct pt_regs, b6);
+			else
+				val = offsetof(struct pt_regs, b7);
+		}
+		break;
+
+	case UNW_WHERE_SPREL:
+		opc = UNW_INSN_ADD_SP;
+		break;
+
+	case UNW_WHERE_PSPREL:
+		opc = UNW_INSN_ADD_PSP;
+		break;
+
+	default:
+		UNW_DPRINT(0, "unwind.%s: register %u has unexpected `where' value of %u\n",
+			   __FUNCTION__, i, r->where);
+		break;
+	}
+	insn.opc = opc;
+	insn.dst = unw.preg_index[i];
+	insn.val = val;
+	script_emit(script, insn);
+	if (need_nat_info)
+		emit_nat_info(sr, i, script);
+
+	if (i == UNW_REG_PSP) {
+		/*
+		 * info->psp must contain the _value_ of the previous
+		 * sp, not its save location.
We get this by
+		 * dereferencing the value we just stored in
+		 * info->psp:
+		 */
+		insn.opc = UNW_INSN_LOAD;
+		insn.dst = insn.val = unw.preg_index[UNW_REG_PSP];
+		script_emit(script, insn);
+	}
+}
+
+static inline const struct unw_table_entry *
+lookup (struct unw_table *table, unsigned long rel_ip)
+{
+	const struct unw_table_entry *e = NULL;
+	unsigned long lo, hi, mid;
+
+	/* do a binary search for right entry: */
+	for (lo = 0, hi = table->length; lo < hi; ) {
+		mid = (lo + hi) / 2;
+		e = &table->array[mid];
+		if (rel_ip < e->start_offset)
+			hi = mid;
+		else if (rel_ip >= e->end_offset)
+			lo = mid + 1;
+		else
+			break;
+	}
+	if (rel_ip < e->start_offset || rel_ip >= e->end_offset)
+		return NULL;
+	return e;
+}
+
+/*
+ * Build an unwind script that unwinds from state OLD_STATE to the
+ * entrypoint of the function that called OLD_STATE.
+ */
+static inline struct unw_script *
+build_script (struct unw_frame_info *info)
+{
+	const struct unw_table_entry *e = NULL;
+	struct unw_script *script = NULL;
+	struct unw_labeled_state *ls, *next;
+	unsigned long ip = info->ip;
+	struct unw_state_record sr;
+	struct unw_table *table;
+	struct unw_reg_info *r;
+	struct unw_insn insn;
+	u8 *dp, *desc_end;
+	u64 hdr;
+	int i;
+	STAT(unsigned long start, parse_start;)
+
+	STAT(++unw.stat.script.builds; start = ia64_get_itc());
+
+	/* build state record */
+	memset(&sr, 0, sizeof(sr));
+	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
+		r->when = UNW_WHEN_NEVER;
+	sr.pr_val = info->pr;
+
+	UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __FUNCTION__, ip);
+	script = script_new(ip);
+	if (!script) {
+		UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n", __FUNCTION__);
+		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
+		return NULL;
+	}
+	unw.cache[info->prev_script].hint = script - unw.cache;
+
+	/* search the kernel's and the modules' unwind tables for IP: */
+
+	STAT(parse_start = ia64_get_itc());
+
+	for (table = unw.tables; table; table = table->next) {
+		if (ip >= table->start && ip < table->end) {
+			e = lookup(table, ip - table->segment_base);
+			break;
+		}
+	}
+	if (!e) {
+		/* no info, return default unwinder (leaf proc, no mem stack, no saved regs) */
+		UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n",
+			   __FUNCTION__, ip, unw.cache[info->prev_script].ip);
+		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
+		sr.curr.reg[UNW_REG_RP].when = -1;
+		sr.curr.reg[UNW_REG_RP].val = 0;
+		compile_reg(&sr, UNW_REG_RP, script);
+		script_finalize(script, &sr);
+		STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
+		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
+		return script;
+	}
+
+	sr.when_target = (3*((ip & ~0xfUL) - (table->segment_base + e->start_offset))/16
+			  + (ip & 0xfUL));
+	hdr = *(u64 *) (table->segment_base + e->info_offset);
+	dp = (u8 *) (table->segment_base + e->info_offset + 8);
+	desc_end = dp + 8*UNW_LENGTH(hdr);
+
+	while (!sr.done && dp < desc_end)
+		dp = unw_decode(dp, sr.in_body, &sr);
+
+	if (sr.when_target > sr.epilogue_start) {
+		/*
+		 * sp has been restored and all values on the memory stack below
+		 * psp also have been restored.
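Since when_target, computed above, is what this epilogue test and every r->when comparison are measured against, a worked example of the slot arithmetic may help. IA-64 packs three instruction slots into each 16-byte bundle, so the "time" of an ip is three times its bundle offset from the function entry plus the slot number carried in the low four bits. A minimal sketch, with an invented entry point:

#include <stdio.h>
#include <stdint.h>

/* slot count from function start to ip, mirroring build_script() */
static uint64_t
when_target (uint64_t ip, uint64_t func_start)
{
	return 3*((ip & ~0xfULL) - func_start)/16 + (ip & 0xfULL);
}

int
main (void)
{
	uint64_t start = 0xa000000000020000ULL;	/* made-up entry point */

	/* third bundle (offset 0x20), slot 1 -> 3*2 + 1 = 7 */
	printf("t=%llu\n", (unsigned long long) when_target(start + 0x21, start));
	return 0;
}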
+ */
+		sr.curr.reg[UNW_REG_PSP].val = 0;
+		sr.curr.reg[UNW_REG_PSP].where = UNW_WHERE_NONE;
+		sr.curr.reg[UNW_REG_PSP].when = UNW_WHEN_NEVER;
+		for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
+			if ((r->where == UNW_WHERE_PSPREL && r->val <= 0x10)
+			    || r->where == UNW_WHERE_SPREL)
+			{
+				r->val = 0;
+				r->where = UNW_WHERE_NONE;
+				r->when = UNW_WHEN_NEVER;
+			}
+	}
+
+	script->flags = sr.flags;
+
+	/*
+	 * If RP didn't get saved, generate entry for the return link
+	 * register.
+	 */
+	if (sr.curr.reg[UNW_REG_RP].when >= sr.when_target) {
+		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
+		sr.curr.reg[UNW_REG_RP].when = -1;
+		sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg;
+		UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n",
+			   __FUNCTION__, ip, sr.curr.reg[UNW_REG_RP].where,
+			   sr.curr.reg[UNW_REG_RP].val);
+	}
+
+#ifdef UNW_DEBUG
+	UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n",
+		   __FUNCTION__, table->segment_base + e->start_offset, sr.when_target);
+	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) {
+		if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) {
+			UNW_DPRINT(1, " %s <- ", unw.preg_name[r - sr.curr.reg]);
+			switch (r->where) {
+			case UNW_WHERE_GR:     UNW_DPRINT(1, "r%lu", r->val); break;
+			case UNW_WHERE_FR:     UNW_DPRINT(1, "f%lu", r->val); break;
+			case UNW_WHERE_BR:     UNW_DPRINT(1, "b%lu", r->val); break;
+			case UNW_WHERE_SPREL:  UNW_DPRINT(1, "[sp+0x%lx]", r->val); break;
+			case UNW_WHERE_PSPREL: UNW_DPRINT(1, "[psp+0x%lx]", r->val); break;
+			case UNW_WHERE_NONE:
+				UNW_DPRINT(1, "%s+0x%lx", unw.preg_name[r - sr.curr.reg], r->val);
+				break;
+
+			default:
+				UNW_DPRINT(1, "BADWHERE(%d)", r->where);
+				break;
+			}
+			UNW_DPRINT(1, "\t\t%d\n", r->when);
+		}
+	}
+#endif
+
+	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
+
+	/* translate state record into unwinder instructions: */
+
+	/*
+	 * First, set psp if we're dealing with a fixed-size frame;
+	 * subsequent instructions may depend on this value.
+	 */
+	if (sr.when_target > sr.curr.reg[UNW_REG_PSP].when
+	    && (sr.curr.reg[UNW_REG_PSP].where == UNW_WHERE_NONE)
+	    && sr.curr.reg[UNW_REG_PSP].val != 0) {
+		/* new psp is sp plus frame size */
+		insn.opc = UNW_INSN_ADD;
+		insn.dst = offsetof(struct unw_frame_info, psp)/8;
+		insn.val = sr.curr.reg[UNW_REG_PSP].val;	/* frame size */
+		script_emit(script, insn);
+	}
+
+	/* determine where the primary UNaT is: */
+	if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
+		i = UNW_REG_PRI_UNAT_MEM;
+	else if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when)
+		i = UNW_REG_PRI_UNAT_GR;
+	else if (sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when > sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
+		i = UNW_REG_PRI_UNAT_MEM;
+	else
+		i = UNW_REG_PRI_UNAT_GR;
+
+	compile_reg(&sr, i, script);
+
+	for (i = UNW_REG_BSP; i < UNW_NUM_REGS; ++i)
+		compile_reg(&sr, i, script);
+
+	/* free labeled register states & stack: */
+
+	STAT(parse_start = ia64_get_itc());
+	for (ls = sr.labeled_states; ls; ls = next) {
+		next = ls->next;
+		free_state_stack(&ls->saved_state);
+		free_labeled_state(ls);
+	}
+	free_state_stack(&sr.curr);
+	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
+
+	script_finalize(script, &sr);
+	STAT(unw.stat.script.build_time += ia64_get_itc() - start);
+	return script;
+}
+
+/*
+ * Apply the unwinding actions represented by OPS and update SR to
+ * reflect the state that existed upon entry to the function that this
+ * unwinder represents.
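Before the real interpreter below, a toy model of how such a script executes may help: the frame-info structure is treated as a flat array of words, and each instruction moves, adjusts, or dereferences one slot. This sketch supports only three of the opcodes and runs on invented state; run_script() handles the full instruction set.

#include <stdio.h>
#include <stdint.h>

enum toy_opc { TOY_ADD, TOY_MOVE, TOY_LOAD };

struct toy_insn {
	enum toy_opc opc;
	unsigned dst;
	uint64_t val;
};

static void
toy_run (uint64_t *s, const struct toy_insn *insn, int count)
{
	int i;

	for (i = 0; i < count; ++i)
		switch (insn[i].opc) {
		case TOY_ADD:  s[insn[i].dst] += insn[i].val; break;		/* s[dst] += val */
		case TOY_MOVE: s[insn[i].dst] = s[insn[i].val]; break;		/* s[dst] = s[val] */
		case TOY_LOAD: s[insn[i].dst] = *(uint64_t *) s[insn[i].val]; break;
		}
}

int
main (void)
{
	uint64_t saved_psp = 0x1234;	/* pretend save location of the previous sp */
	uint64_t state[3] = { 0, (uint64_t) &saved_psp, 0 };
	struct toy_insn script[] = {
		{ TOY_MOVE, 2, 1 },	/* state[2] = address of the save location */
		{ TOY_LOAD, 2, 2 },	/* state[2] = the saved value itself (the psp trick) */
	};

	toy_run(state, script, 2);
	printf("restored psp = 0x%llx\n", (unsigned long long) state[2]);
	return 0;
}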
+ */ +static inline void +run_script (struct unw_script *script, struct unw_frame_info *state) +{ + struct unw_insn *ip, *limit, next_insn; + unsigned long opc, dst, val, off; + unsigned long *s = (unsigned long *) state; + STAT(unsigned long start;) + + STAT(++unw.stat.script.runs; start = ia64_get_itc()); + state->flags = script->flags; + ip = script->insn; + limit = script->insn + script->count; + next_insn = *ip; + + while (ip++ < limit) { + opc = next_insn.opc; + dst = next_insn.dst; + val = next_insn.val; + next_insn = *ip; + + redo: + switch (opc) { + case UNW_INSN_ADD: + s[dst] += val; + break; + + case UNW_INSN_MOVE2: + if (!s[val]) + goto lazy_init; + s[dst+1] = s[val+1]; + s[dst] = s[val]; + break; + + case UNW_INSN_MOVE: + if (!s[val]) + goto lazy_init; + s[dst] = s[val]; + break; + + case UNW_INSN_MOVE_SCRATCH: + if (state->pt) { + s[dst] = (unsigned long) get_scratch_regs(state) + val; + } else { + s[dst] = 0; + UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n", + __FUNCTION__, dst, val); + } + break; + + case UNW_INSN_MOVE_CONST: + if (val == 0) + s[dst] = (unsigned long) &unw.r0; + else { + s[dst] = 0; + UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n", + __FUNCTION__, val); + } + break; + + + case UNW_INSN_MOVE_STACKED: + s[dst] = (unsigned long) ia64_rse_skip_regs((unsigned long *)state->bsp, + val); + break; + + case UNW_INSN_ADD_PSP: + s[dst] = state->psp + val; + break; + + case UNW_INSN_ADD_SP: + s[dst] = state->sp + val; + break; + + case UNW_INSN_SETNAT_MEMSTK: + if (!state->pri_unat_loc) + state->pri_unat_loc = &state->sw->ar_unat; + /* register off. is a multiple of 8, so the least 3 bits (type) are 0 */ + s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK; + break; + + case UNW_INSN_SETNAT_TYPE: + s[dst+1] = val; + break; + + case UNW_INSN_LOAD: +#ifdef UNW_DEBUG + if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0 + || s[val] < TASK_SIZE) + { + UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n", + __FUNCTION__, s[val]); + break; + } +#endif + s[dst] = *(unsigned long *) s[val]; + break; + } + } + STAT(unw.stat.script.run_time += ia64_get_itc() - start); + return; + + lazy_init: + off = unw.sw_off[val]; + s[val] = (unsigned long) state->sw + off; + if (off >= offsetof(struct switch_stack, r4) && off <= offsetof(struct switch_stack, r7)) + /* + * We're initializing a general register: init NaT info, too. Note that + * the offset is a multiple of 8 which gives us the 3 bits needed for + * the type field. + */ + s[val+1] = (offsetof(struct switch_stack, ar_unat) - off) | UNW_NAT_MEMSTK; + goto redo; +} + +static int +find_save_locs (struct unw_frame_info *info) +{ + int have_write_lock = 0; + struct unw_script *scr; + unsigned long flags = 0; + + if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) { + /* don't let obviously bad addresses pollute the cache */ + /* FIXME: should really be level 0 but it occurs too often. 
KAO */ + UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip); + info->rp_loc = NULL; + return -1; + } + + scr = script_lookup(info); + if (!scr) { + spin_lock_irqsave(&unw.lock, flags); + scr = build_script(info); + if (!scr) { + spin_unlock_irqrestore(&unw.lock, flags); + UNW_DPRINT(0, + "unwind.%s: failed to locate/build unwind script for ip %lx\n", + __FUNCTION__, info->ip); + return -1; + } + have_write_lock = 1; + } + info->hint = scr->hint; + info->prev_script = scr - unw.cache; + + run_script(scr, info); + + if (have_write_lock) { + write_unlock(&scr->lock); + spin_unlock_irqrestore(&unw.lock, flags); + } else + read_unlock(&scr->lock); + return 0; +} + +int +unw_unwind (struct unw_frame_info *info) +{ + unsigned long prev_ip, prev_sp, prev_bsp; + unsigned long ip, pr, num_regs; + STAT(unsigned long start, flags;) + int retval; + + STAT(local_irq_save(flags); ++unw.stat.api.unwinds; start = ia64_get_itc()); + + prev_ip = info->ip; + prev_sp = info->sp; + prev_bsp = info->bsp; + + /* restore the ip */ + if (!info->rp_loc) { + /* FIXME: should really be level 0 but it occurs too often. KAO */ + UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n", + __FUNCTION__, info->ip); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + ip = info->ip = *info->rp_loc; + if (ip < GATE_ADDR) { + UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __FUNCTION__, ip); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + /* restore the cfm: */ + if (!info->pfs_loc) { + UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __FUNCTION__); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + info->cfm_loc = info->pfs_loc; + + /* restore the bsp: */ + pr = info->pr; + num_regs = 0; + if ((info->flags & UNW_FLAG_INTERRUPT_FRAME)) { + info->pt = info->sp + 16; + if ((pr & (1UL << PRED_NON_SYSCALL)) != 0) + num_regs = *info->cfm_loc & 0x7f; /* size of frame */ + info->pfs_loc = + (unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs)); + UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __FUNCTION__, info->pt); + } else + num_regs = (*info->cfm_loc >> 7) & 0x7f; /* size of locals */ + info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs); + if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) { + UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n", + __FUNCTION__, info->bsp, info->regstk.limit, info->regstk.top); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + /* restore the sp: */ + info->sp = info->psp; + if (info->sp < info->memstk.top || info->sp > info->memstk.limit) { + UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n", + __FUNCTION__, info->sp, info->memstk.top, info->memstk.limit); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) { + UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n", + __FUNCTION__, ip); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + /* as we unwind, the saved ar.unat becomes the primary unat: */ + info->pri_unat_loc = info->unat_loc; + + /* finally, restore the predicates: */ + unw_get_pr(info, &info->pr); + + 
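	/*
	 * A worked example of the ia64_rse_skip_regs() arithmetic used for
	 * the bsp restore above (addresses invented): the RSE backing store
	 * holds a NaT collection instead of a register in every slot whose
	 * address has bits 3..8 all set.  With bsp = 0xe000000000000200 and
	 * num_regs = 2, slot 0xe0000000000001f8 is such a collection, so
	 * stepping back two *registers* really steps back three slots, to
	 * 0xe0000000000001e8.  This is why the code cannot simply subtract
	 * 8*num_regs from bsp.
	 */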
retval = find_save_locs(info); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return retval; +} +EXPORT_SYMBOL(unw_unwind); + +int +unw_unwind_to_user (struct unw_frame_info *info) +{ + unsigned long ip, sp; + + while (unw_unwind(info) >= 0) { + if (unw_get_rp(info, &ip) < 0) { + unw_get_ip(info, &ip); + UNW_DPRINT(0, "unwind.%s: failed to read return pointer (ip=0x%lx)\n", + __FUNCTION__, ip); + return -1; + } + unw_get_sp(info, &sp); + if (sp >= (unsigned long)info->task + IA64_STK_OFFSET) + break; + if (ip < FIXADDR_USER_END) + return 0; + } + unw_get_ip(info, &ip); + UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n", __FUNCTION__, ip); + return -1; +} +EXPORT_SYMBOL(unw_unwind_to_user); + +static void +init_frame_info (struct unw_frame_info *info, struct task_struct *t, + struct switch_stack *sw, unsigned long stktop) +{ + unsigned long rbslimit, rbstop, stklimit; + STAT(unsigned long start, flags;) + + STAT(local_irq_save(flags); ++unw.stat.api.inits; start = ia64_get_itc()); + + /* + * Subtle stuff here: we _could_ unwind through the switch_stack frame but we + * don't want to do that because it would be slow as each preserved register would + * have to be processed. Instead, what we do here is zero out the frame info and + * start the unwind process at the function that created the switch_stack frame. + * When a preserved value in switch_stack needs to be accessed, run_script() will + * initialize the appropriate pointer on demand. + */ + memset(info, 0, sizeof(*info)); + + rbslimit = (unsigned long) t + IA64_RBS_OFFSET; + rbstop = sw->ar_bspstore; + if (rbstop - (unsigned long) t >= IA64_STK_OFFSET) + rbstop = rbslimit; + + stklimit = (unsigned long) t + IA64_STK_OFFSET; + if (stktop <= rbstop) + stktop = rbstop; + + info->regstk.limit = rbslimit; + info->regstk.top = rbstop; + info->memstk.limit = stklimit; + info->memstk.top = stktop; + info->task = t; + info->sw = sw; + info->sp = info->psp = stktop; + info->pr = sw->pr; + UNW_DPRINT(3, "unwind.%s:\n" + " task 0x%lx\n" + " rbs = [0x%lx-0x%lx)\n" + " stk = [0x%lx-0x%lx)\n" + " pr 0x%lx\n" + " sw 0x%lx\n" + " sp 0x%lx\n", + __FUNCTION__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit, + info->pr, (unsigned long) info->sw, info->sp); + STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags)); +} + +void +unw_init_from_interruption (struct unw_frame_info *info, struct task_struct *t, + struct pt_regs *pt, struct switch_stack *sw) +{ + unsigned long sof; + + init_frame_info(info, t, sw, pt->r12); + info->cfm_loc = &pt->cr_ifs; + info->unat_loc = &pt->ar_unat; + info->pfs_loc = &pt->ar_pfs; + sof = *info->cfm_loc & 0x7f; + info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sof); + info->ip = pt->cr_iip + ia64_psr(pt)->ri; + info->pt = (unsigned long) pt; + UNW_DPRINT(3, "unwind.%s:\n" + " bsp 0x%lx\n" + " sof 0x%lx\n" + " ip 0x%lx\n", + __FUNCTION__, info->bsp, sof, info->ip); + find_save_locs(info); +} + +void +unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw) +{ + unsigned long sol; + + init_frame_info(info, t, sw, (unsigned long) (sw + 1) - 16); + info->cfm_loc = &sw->ar_pfs; + sol = (*info->cfm_loc >> 7) & 0x7f; + info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sol); + info->ip = sw->b0; + UNW_DPRINT(3, "unwind.%s:\n" + " bsp 0x%lx\n" + " sol 0x%lx\n" + " ip 0x%lx\n", + __FUNCTION__, info->bsp, sol, info->ip); + 
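	/*
	 * Sketch of the cfm/ar.pfs decoding above, with a made-up value:
	 * bits 0..6 hold sof (size of frame) and bits 7..13 hold sol
	 * (size of locals).  A saved ar.pfs of 0x308 thus gives
	 * sof = 0x308 & 0x7f = 8 and sol = (0x308 >> 7) & 0x7f = 6, i.e.
	 * an 8-register frame with 6 locals and 2 outputs.  Only the
	 * locals are skipped when stepping bsp back, since the outputs
	 * belong to the callee's frame.
	 */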
find_save_locs(info); +} + +EXPORT_SYMBOL(unw_init_frame_info); + +void +unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t) +{ + struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16); + + UNW_DPRINT(1, "unwind.%s\n", __FUNCTION__); + unw_init_frame_info(info, t, sw); +} +EXPORT_SYMBOL(unw_init_from_blocked_task); + +static void +init_unwind_table (struct unw_table *table, const char *name, unsigned long segment_base, + unsigned long gp, const void *table_start, const void *table_end) +{ + const struct unw_table_entry *start = table_start, *end = table_end; + + table->name = name; + table->segment_base = segment_base; + table->gp = gp; + table->start = segment_base + start[0].start_offset; + table->end = segment_base + end[-1].end_offset; + table->array = start; + table->length = end - start; +} + +void * +unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp, + const void *table_start, const void *table_end) +{ + const struct unw_table_entry *start = table_start, *end = table_end; + struct unw_table *table; + unsigned long flags; + + if (end - start <= 0) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n", + __FUNCTION__); + return NULL; + } + + table = kmalloc(sizeof(*table), GFP_USER); + if (!table) + return NULL; + + init_unwind_table(table, name, segment_base, gp, table_start, table_end); + + spin_lock_irqsave(&unw.lock, flags); + { + /* keep kernel unwind table at the front (it's searched most commonly): */ + table->next = unw.tables->next; + unw.tables->next = table; + } + spin_unlock_irqrestore(&unw.lock, flags); + + return table; +} + +void +unw_remove_unwind_table (void *handle) +{ + struct unw_table *table, *prev; + struct unw_script *tmp; + unsigned long flags; + long index; + + if (!handle) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n", + __FUNCTION__); + return; + } + + table = handle; + if (table == &unw.kernel_table) { + UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a " + "no-can-do!\n", __FUNCTION__); + return; + } + + spin_lock_irqsave(&unw.lock, flags); + { + /* first, delete the table: */ + + for (prev = (struct unw_table *) &unw.tables; prev; prev = prev->next) + if (prev->next == table) + break; + if (!prev) { + UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n", + __FUNCTION__, (void *) table); + spin_unlock_irqrestore(&unw.lock, flags); + return; + } + prev->next = table->next; + } + spin_unlock_irqrestore(&unw.lock, flags); + + /* next, remove hash table entries for this table */ + + for (index = 0; index <= UNW_HASH_SIZE; ++index) { + tmp = unw.cache + unw.hash[index]; + if (unw.hash[index] >= UNW_CACHE_SIZE + || tmp->ip < table->start || tmp->ip >= table->end) + continue; + + write_lock(&tmp->lock); + { + if (tmp->ip >= table->start && tmp->ip < table->end) { + unw.hash[index] = tmp->coll_chain; + tmp->ip = 0; + } + } + write_unlock(&tmp->lock); + } + + kfree(table); +} + +static int __init +create_gate_table (void) +{ + const struct unw_table_entry *entry, *start, *end; + unsigned long *lp, segbase = GATE_ADDR; + size_t info_size, size; + char *info; + Elf64_Phdr *punw = NULL, *phdr = (Elf64_Phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i, ++phdr) + if (phdr->p_type == PT_IA_64_UNWIND) { + punw = phdr; + break; + } + + if (!punw) { + printk("%s: failed to find gate DSO's unwind table!\n", __FUNCTION__); + return 0; + } + + start = (const 
struct unw_table_entry *) punw->p_vaddr; + end = (struct unw_table_entry *) ((char *) start + punw->p_memsz); + size = 0; + + unw_add_unwind_table("linux-gate.so", segbase, 0, start, end); + + for (entry = start; entry < end; ++entry) + size += 3*8 + 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset)); + size += 8; /* reserve space for "end of table" marker */ + + unw.gate_table = kmalloc(size, GFP_KERNEL); + if (!unw.gate_table) { + unw.gate_table_size = 0; + printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __FUNCTION__); + return 0; + } + unw.gate_table_size = size; + + lp = unw.gate_table; + info = (char *) unw.gate_table + size; + + for (entry = start; entry < end; ++entry, lp += 3) { + info_size = 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset)); + info -= info_size; + memcpy(info, (char *) segbase + entry->info_offset, info_size); + + lp[0] = segbase + entry->start_offset; /* start */ + lp[1] = segbase + entry->end_offset; /* end */ + lp[2] = info - (char *) unw.gate_table; /* info */ + } + *lp = 0; /* end-of-table marker */ + return 0; +} + +__initcall(create_gate_table); + +void __init +unw_init (void) +{ + extern char __gp[]; + extern void unw_hash_index_t_is_too_narrow (void); + long i, off; + + if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE) + unw_hash_index_t_is_too_narrow(); + + unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE); + unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0); + unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR); + unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC); + unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR); + for (i = UNW_REG_R4, off = SW(R4); i <= UNW_REG_R7; ++i, off += 8) + unw.sw_off[unw.preg_index[i]] = off; + for (i = UNW_REG_B1, off = SW(B1); i <= UNW_REG_B5; ++i, off += 8) + unw.sw_off[unw.preg_index[i]] = off; + for (i = UNW_REG_F2, off = SW(F2); i <= UNW_REG_F5; ++i, off += 16) + unw.sw_off[unw.preg_index[i]] = off; + for (i = UNW_REG_F16, off = SW(F16); i <= UNW_REG_F31; ++i, off += 16) + unw.sw_off[unw.preg_index[i]] = off; + + for (i = 0; i < UNW_CACHE_SIZE; ++i) { + if (i > 0) + unw.cache[i].lru_chain = (i - 1); + unw.cache[i].coll_chain = -1; + rwlock_init(&unw.cache[i].lock); + } + unw.lru_head = UNW_CACHE_SIZE - 1; + unw.lru_tail = 0; + + init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp, + __start_unwind, __end_unwind); +} + +/* + * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED + * + * This system call has been deprecated. The new and improved way to get + * at the kernel's unwind info is via the gate DSO. The address of the + * ELF header for this DSO is passed to user-level via AT_SYSINFO_EHDR. + * + * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED + * + * This system call copies the unwind data into the buffer pointed to by BUF and returns + * the size of the unwind data. If BUF_SIZE is smaller than the size of the unwind data + * or if BUF is NULL, nothing is copied, but the system call still returns the size of the + * unwind data. + * + * The first portion of the unwind data contains an unwind table and rest contains the + * associated unwind info (in no particular order). 
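As a concrete illustration, here is a user-space sketch that walks a buffer in this format, using the entry layout documented just below; the buffer contents are invented, and new code should really obtain this data from the gate DSO instead.

#include <stdio.h>
#include <stdint.h>

struct entry {
	uint64_t start;		/* address of start of function */
	uint64_t end;		/* address of end of function */
	uint64_t info;		/* buffer-relative offset to unwind info */
};

static void
dump_table (const void *buf)
{
	const struct entry *e;

	for (e = buf; e->start != 0; ++e)	/* start == 0 marks the end */
		printf("func [0x%llx-0x%llx), info at +0x%llx\n",
		       (unsigned long long) e->start, (unsigned long long) e->end,
		       (unsigned long long) e->info);
}

int
main (void)
{
	/* a fake one-entry buffer standing in for sys_getunwind() output: */
	struct entry fake[] = {
		{ 0xa000000000010000ULL, 0xa000000000010040ULL, 0x38 },
		{ 0, 0, 0 },
	};

	dump_table(fake);
	return 0;
}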
The unwind table consists of a table + * of entries of the form: + * + * u64 start; (64-bit address of start of function) + * u64 end; (64-bit address of start of function) + * u64 info; (BUF-relative offset to unwind info) + * + * The end of the unwind table is indicated by an entry with a START address of zero. + * + * Please see the IA-64 Software Conventions and Runtime Architecture manual for details + * on the format of the unwind info. + * + * ERRORS + * EFAULT BUF points outside your accessible address space. + */ +asmlinkage long +sys_getunwind (void __user *buf, size_t buf_size) +{ + if (buf && buf_size >= unw.gate_table_size) + if (copy_to_user(buf, unw.gate_table, unw.gate_table_size) != 0) + return -EFAULT; + return unw.gate_table_size; +} diff --git a/arch/ia64/kernel/unwind_decoder.c b/arch/ia64/kernel/unwind_decoder.c new file mode 100644 index 000000000000..50ac2d82f9bf --- /dev/null +++ b/arch/ia64/kernel/unwind_decoder.c @@ -0,0 +1,459 @@ +/* + * Copyright (C) 2000 Hewlett-Packard Co + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> + * + * Generic IA-64 unwind info decoder. + * + * This file is used both by the Linux kernel and objdump. Please keep + * the two copies of this file in sync. + * + * You need to customize the decoder by defining the following + * macros/constants before including this file: + * + * Types: + * unw_word Unsigned integer type with at least 64 bits + * + * Register names: + * UNW_REG_BSP + * UNW_REG_BSPSTORE + * UNW_REG_FPSR + * UNW_REG_LC + * UNW_REG_PFS + * UNW_REG_PR + * UNW_REG_RNAT + * UNW_REG_PSP + * UNW_REG_RP + * UNW_REG_UNAT + * + * Decoder action macros: + * UNW_DEC_BAD_CODE(code) + * UNW_DEC_ABI(fmt,abi,context,arg) + * UNW_DEC_BR_GR(fmt,brmask,gr,arg) + * UNW_DEC_BR_MEM(fmt,brmask,arg) + * UNW_DEC_COPY_STATE(fmt,label,arg) + * UNW_DEC_EPILOGUE(fmt,t,ecount,arg) + * UNW_DEC_FRGR_MEM(fmt,grmask,frmask,arg) + * UNW_DEC_FR_MEM(fmt,frmask,arg) + * UNW_DEC_GR_GR(fmt,grmask,gr,arg) + * UNW_DEC_GR_MEM(fmt,grmask,arg) + * UNW_DEC_LABEL_STATE(fmt,label,arg) + * UNW_DEC_MEM_STACK_F(fmt,t,size,arg) + * UNW_DEC_MEM_STACK_V(fmt,t,arg) + * UNW_DEC_PRIUNAT_GR(fmt,r,arg) + * UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg) + * UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg) + * UNW_DEC_PRIUNAT_WHEN_PSPREL(fmt,pspoff,arg) + * UNW_DEC_PRIUNAT_WHEN_SPREL(fmt,spoff,arg) + * UNW_DEC_PROLOGUE(fmt,body,rlen,arg) + * UNW_DEC_PROLOGUE_GR(fmt,rlen,mask,grsave,arg) + * UNW_DEC_REG_PSPREL(fmt,reg,pspoff,arg) + * UNW_DEC_REG_REG(fmt,src,dst,arg) + * UNW_DEC_REG_SPREL(fmt,reg,spoff,arg) + * UNW_DEC_REG_WHEN(fmt,reg,t,arg) + * UNW_DEC_RESTORE(fmt,t,abreg,arg) + * UNW_DEC_RESTORE_P(fmt,qp,t,abreg,arg) + * UNW_DEC_SPILL_BASE(fmt,pspoff,arg) + * UNW_DEC_SPILL_MASK(fmt,imaskp,arg) + * UNW_DEC_SPILL_PSPREL(fmt,t,abreg,pspoff,arg) + * UNW_DEC_SPILL_PSPREL_P(fmt,qp,t,abreg,pspoff,arg) + * UNW_DEC_SPILL_REG(fmt,t,abreg,x,ytreg,arg) + * UNW_DEC_SPILL_REG_P(fmt,qp,t,abreg,x,ytreg,arg) + * UNW_DEC_SPILL_SPREL(fmt,t,abreg,spoff,arg) + * UNW_DEC_SPILL_SPREL_P(fmt,qp,t,abreg,pspoff,arg) + */ + +static unw_word +unw_decode_uleb128 (unsigned char **dpp) +{ + unsigned shift = 0; + unw_word byte, result = 0; + unsigned char *bp = *dpp; + + while (1) + { + byte = *bp++; + result |= (byte & 0x7f) << shift; + if ((byte & 0x80) == 0) + break; + shift += 7; + } + *dpp = bp; + return result; +} + +static unsigned char * +unw_decode_x1 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, abreg; + unw_word t, off; + + byte1 = *dp++; + t = unw_decode_uleb128 (&dp); + off = 
unw_decode_uleb128 (&dp); + abreg = (byte1 & 0x7f); + if (byte1 & 0x80) + UNW_DEC_SPILL_SPREL(X1, t, abreg, off, arg); + else + UNW_DEC_SPILL_PSPREL(X1, t, abreg, off, arg); + return dp; +} + +static unsigned char * +unw_decode_x2 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, byte2, abreg, x, ytreg; + unw_word t; + + byte1 = *dp++; byte2 = *dp++; + t = unw_decode_uleb128 (&dp); + abreg = (byte1 & 0x7f); + ytreg = byte2; + x = (byte1 >> 7) & 1; + if ((byte1 & 0x80) == 0 && ytreg == 0) + UNW_DEC_RESTORE(X2, t, abreg, arg); + else + UNW_DEC_SPILL_REG(X2, t, abreg, x, ytreg, arg); + return dp; +} + +static unsigned char * +unw_decode_x3 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, byte2, abreg, qp; + unw_word t, off; + + byte1 = *dp++; byte2 = *dp++; + t = unw_decode_uleb128 (&dp); + off = unw_decode_uleb128 (&dp); + + qp = (byte1 & 0x3f); + abreg = (byte2 & 0x7f); + + if (byte1 & 0x80) + UNW_DEC_SPILL_SPREL_P(X3, qp, t, abreg, off, arg); + else + UNW_DEC_SPILL_PSPREL_P(X3, qp, t, abreg, off, arg); + return dp; +} + +static unsigned char * +unw_decode_x4 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, byte2, byte3, qp, abreg, x, ytreg; + unw_word t; + + byte1 = *dp++; byte2 = *dp++; byte3 = *dp++; + t = unw_decode_uleb128 (&dp); + + qp = (byte1 & 0x3f); + abreg = (byte2 & 0x7f); + x = (byte2 >> 7) & 1; + ytreg = byte3; + + if ((byte2 & 0x80) == 0 && byte3 == 0) + UNW_DEC_RESTORE_P(X4, qp, t, abreg, arg); + else + UNW_DEC_SPILL_REG_P(X4, qp, t, abreg, x, ytreg, arg); + return dp; +} + +static unsigned char * +unw_decode_r1 (unsigned char *dp, unsigned char code, void *arg) +{ + int body = (code & 0x20) != 0; + unw_word rlen; + + rlen = (code & 0x1f); + UNW_DEC_PROLOGUE(R1, body, rlen, arg); + return dp; +} + +static unsigned char * +unw_decode_r2 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, mask, grsave; + unw_word rlen; + + byte1 = *dp++; + + mask = ((code & 0x7) << 1) | ((byte1 >> 7) & 1); + grsave = (byte1 & 0x7f); + rlen = unw_decode_uleb128 (&dp); + UNW_DEC_PROLOGUE_GR(R2, rlen, mask, grsave, arg); + return dp; +} + +static unsigned char * +unw_decode_r3 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word rlen; + + rlen = unw_decode_uleb128 (&dp); + UNW_DEC_PROLOGUE(R3, ((code & 0x3) == 1), rlen, arg); + return dp; +} + +static unsigned char * +unw_decode_p1 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char brmask = (code & 0x1f); + + UNW_DEC_BR_MEM(P1, brmask, arg); + return dp; +} + +static unsigned char * +unw_decode_p2_p5 (unsigned char *dp, unsigned char code, void *arg) +{ + if ((code & 0x10) == 0) + { + unsigned char byte1 = *dp++; + + UNW_DEC_BR_GR(P2, ((code & 0xf) << 1) | ((byte1 >> 7) & 1), + (byte1 & 0x7f), arg); + } + else if ((code & 0x08) == 0) + { + unsigned char byte1 = *dp++, r, dst; + + r = ((code & 0x7) << 1) | ((byte1 >> 7) & 1); + dst = (byte1 & 0x7f); + switch (r) + { + case 0: UNW_DEC_REG_GR(P3, UNW_REG_PSP, dst, arg); break; + case 1: UNW_DEC_REG_GR(P3, UNW_REG_RP, dst, arg); break; + case 2: UNW_DEC_REG_GR(P3, UNW_REG_PFS, dst, arg); break; + case 3: UNW_DEC_REG_GR(P3, UNW_REG_PR, dst, arg); break; + case 4: UNW_DEC_REG_GR(P3, UNW_REG_UNAT, dst, arg); break; + case 5: UNW_DEC_REG_GR(P3, UNW_REG_LC, dst, arg); break; + case 6: UNW_DEC_RP_BR(P3, dst, arg); break; + case 7: UNW_DEC_REG_GR(P3, UNW_REG_RNAT, dst, arg); break; + case 8: UNW_DEC_REG_GR(P3, UNW_REG_BSP, dst, arg); break; + case 9: UNW_DEC_REG_GR(P3, 
UNW_REG_BSPSTORE, dst, arg); break; + case 10: UNW_DEC_REG_GR(P3, UNW_REG_FPSR, dst, arg); break; + case 11: UNW_DEC_PRIUNAT_GR(P3, dst, arg); break; + default: UNW_DEC_BAD_CODE(r); break; + } + } + else if ((code & 0x7) == 0) + UNW_DEC_SPILL_MASK(P4, dp, arg); + else if ((code & 0x7) == 1) + { + unw_word grmask, frmask, byte1, byte2, byte3; + + byte1 = *dp++; byte2 = *dp++; byte3 = *dp++; + grmask = ((byte1 >> 4) & 0xf); + frmask = ((byte1 & 0xf) << 16) | (byte2 << 8) | byte3; + UNW_DEC_FRGR_MEM(P5, grmask, frmask, arg); + } + else + UNW_DEC_BAD_CODE(code); + return dp; +} + +static unsigned char * +unw_decode_p6 (unsigned char *dp, unsigned char code, void *arg) +{ + int gregs = (code & 0x10) != 0; + unsigned char mask = (code & 0x0f); + + if (gregs) + UNW_DEC_GR_MEM(P6, mask, arg); + else + UNW_DEC_FR_MEM(P6, mask, arg); + return dp; +} + +static unsigned char * +unw_decode_p7_p10 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char r, byte1, byte2; + unw_word t, size; + + if ((code & 0x10) == 0) + { + r = (code & 0xf); + t = unw_decode_uleb128 (&dp); + switch (r) + { + case 0: + size = unw_decode_uleb128 (&dp); + UNW_DEC_MEM_STACK_F(P7, t, size, arg); + break; + + case 1: UNW_DEC_MEM_STACK_V(P7, t, arg); break; + case 2: UNW_DEC_SPILL_BASE(P7, t, arg); break; + case 3: UNW_DEC_REG_SPREL(P7, UNW_REG_PSP, t, arg); break; + case 4: UNW_DEC_REG_WHEN(P7, UNW_REG_RP, t, arg); break; + case 5: UNW_DEC_REG_PSPREL(P7, UNW_REG_RP, t, arg); break; + case 6: UNW_DEC_REG_WHEN(P7, UNW_REG_PFS, t, arg); break; + case 7: UNW_DEC_REG_PSPREL(P7, UNW_REG_PFS, t, arg); break; + case 8: UNW_DEC_REG_WHEN(P7, UNW_REG_PR, t, arg); break; + case 9: UNW_DEC_REG_PSPREL(P7, UNW_REG_PR, t, arg); break; + case 10: UNW_DEC_REG_WHEN(P7, UNW_REG_LC, t, arg); break; + case 11: UNW_DEC_REG_PSPREL(P7, UNW_REG_LC, t, arg); break; + case 12: UNW_DEC_REG_WHEN(P7, UNW_REG_UNAT, t, arg); break; + case 13: UNW_DEC_REG_PSPREL(P7, UNW_REG_UNAT, t, arg); break; + case 14: UNW_DEC_REG_WHEN(P7, UNW_REG_FPSR, t, arg); break; + case 15: UNW_DEC_REG_PSPREL(P7, UNW_REG_FPSR, t, arg); break; + default: UNW_DEC_BAD_CODE(r); break; + } + } + else + { + switch (code & 0xf) + { + case 0x0: /* p8 */ + { + r = *dp++; + t = unw_decode_uleb128 (&dp); + switch (r) + { + case 1: UNW_DEC_REG_SPREL(P8, UNW_REG_RP, t, arg); break; + case 2: UNW_DEC_REG_SPREL(P8, UNW_REG_PFS, t, arg); break; + case 3: UNW_DEC_REG_SPREL(P8, UNW_REG_PR, t, arg); break; + case 4: UNW_DEC_REG_SPREL(P8, UNW_REG_LC, t, arg); break; + case 5: UNW_DEC_REG_SPREL(P8, UNW_REG_UNAT, t, arg); break; + case 6: UNW_DEC_REG_SPREL(P8, UNW_REG_FPSR, t, arg); break; + case 7: UNW_DEC_REG_WHEN(P8, UNW_REG_BSP, t, arg); break; + case 8: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSP, t, arg); break; + case 9: UNW_DEC_REG_SPREL(P8, UNW_REG_BSP, t, arg); break; + case 10: UNW_DEC_REG_WHEN(P8, UNW_REG_BSPSTORE, t, arg); break; + case 11: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSPSTORE, t, arg); break; + case 12: UNW_DEC_REG_SPREL(P8, UNW_REG_BSPSTORE, t, arg); break; + case 13: UNW_DEC_REG_WHEN(P8, UNW_REG_RNAT, t, arg); break; + case 14: UNW_DEC_REG_PSPREL(P8, UNW_REG_RNAT, t, arg); break; + case 15: UNW_DEC_REG_SPREL(P8, UNW_REG_RNAT, t, arg); break; + case 16: UNW_DEC_PRIUNAT_WHEN_GR(P8, t, arg); break; + case 17: UNW_DEC_PRIUNAT_PSPREL(P8, t, arg); break; + case 18: UNW_DEC_PRIUNAT_SPREL(P8, t, arg); break; + case 19: UNW_DEC_PRIUNAT_WHEN_MEM(P8, t, arg); break; + default: UNW_DEC_BAD_CODE(r); break; + } + } + break; + + case 0x1: + byte1 = *dp++; byte2 = *dp++; + UNW_DEC_GR_GR(P9, (byte1 
& 0xf), (byte2 & 0x7f), arg); + break; + + case 0xf: /* p10 */ + byte1 = *dp++; byte2 = *dp++; + UNW_DEC_ABI(P10, byte1, byte2, arg); + break; + + case 0x9: + return unw_decode_x1 (dp, code, arg); + + case 0xa: + return unw_decode_x2 (dp, code, arg); + + case 0xb: + return unw_decode_x3 (dp, code, arg); + + case 0xc: + return unw_decode_x4 (dp, code, arg); + + default: + UNW_DEC_BAD_CODE(code); + break; + } + } + return dp; +} + +static unsigned char * +unw_decode_b1 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word label = (code & 0x1f); + + if ((code & 0x20) != 0) + UNW_DEC_COPY_STATE(B1, label, arg); + else + UNW_DEC_LABEL_STATE(B1, label, arg); + return dp; +} + +static unsigned char * +unw_decode_b2 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word t; + + t = unw_decode_uleb128 (&dp); + UNW_DEC_EPILOGUE(B2, t, (code & 0x1f), arg); + return dp; +} + +static unsigned char * +unw_decode_b3_x4 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word t, ecount, label; + + if ((code & 0x10) == 0) + { + t = unw_decode_uleb128 (&dp); + ecount = unw_decode_uleb128 (&dp); + UNW_DEC_EPILOGUE(B3, t, ecount, arg); + } + else if ((code & 0x07) == 0) + { + label = unw_decode_uleb128 (&dp); + if ((code & 0x08) != 0) + UNW_DEC_COPY_STATE(B4, label, arg); + else + UNW_DEC_LABEL_STATE(B4, label, arg); + } + else + switch (code & 0x7) + { + case 1: return unw_decode_x1 (dp, code, arg); + case 2: return unw_decode_x2 (dp, code, arg); + case 3: return unw_decode_x3 (dp, code, arg); + case 4: return unw_decode_x4 (dp, code, arg); + default: UNW_DEC_BAD_CODE(code); break; + } + return dp; +} + +typedef unsigned char *(*unw_decoder) (unsigned char *, unsigned char, void *); + +static unw_decoder unw_decode_table[2][8] = +{ + /* prologue table: */ + { + unw_decode_r1, /* 0 */ + unw_decode_r1, + unw_decode_r2, + unw_decode_r3, + unw_decode_p1, /* 4 */ + unw_decode_p2_p5, + unw_decode_p6, + unw_decode_p7_p10 + }, + { + unw_decode_r1, /* 0 */ + unw_decode_r1, + unw_decode_r2, + unw_decode_r3, + unw_decode_b1, /* 4 */ + unw_decode_b1, + unw_decode_b2, + unw_decode_b3_x4 + } +}; + +/* + * Decode one descriptor and return address of next descriptor. + */ +static inline unsigned char * +unw_decode (unsigned char *dp, int inside_body, void *arg) +{ + unw_decoder decoder; + unsigned char code; + + code = *dp++; + decoder = unw_decode_table[inside_body][code >> 5]; + dp = (*decoder) (dp, code, arg); + return dp; +} diff --git a/arch/ia64/kernel/unwind_i.h b/arch/ia64/kernel/unwind_i.h new file mode 100644 index 000000000000..96693a6ae370 --- /dev/null +++ b/arch/ia64/kernel/unwind_i.h @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * Kernel unwind support. 
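The two-level dispatch in unw_decode() at the end of the decoder above condenses the whole descriptor encoding: the top three bits of the first byte select a format class, and the same byte means different things inside a prologue region and inside a body region. A small stand-alone sketch of that dispatch (sample bytes invented, format names from the tables above):

#include <stdio.h>

static const char *prologue_fmt[8] = {
	"R1", "R1", "R2", "R3", "P1", "P2..P5", "P6", "P7..P10"
};
static const char *body_fmt[8] = {
	"R1", "R1", "R2", "R3", "B1", "B1", "B2", "B3/X1..X4"
};

int
main (void)
{
	unsigned char sample[] = { 0x04, 0x41, 0x81, 0xe0 };	/* invented descriptor bytes */
	unsigned i;

	for (i = 0; i < sizeof(sample); ++i)
		printf("0x%02x -> %-7s in a prologue, %s in a body\n",
		       sample[i], prologue_fmt[sample[i] >> 5], body_fmt[sample[i] >> 5]);
	return 0;
}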
+ */ + +#define UNW_VER(x) ((x) >> 48) +#define UNW_FLAG_MASK 0x0000ffff00000000 +#define UNW_FLAG_OSMASK 0x0000f00000000000 +#define UNW_FLAG_EHANDLER(x) ((x) & 0x0000000100000000L) +#define UNW_FLAG_UHANDLER(x) ((x) & 0x0000000200000000L) +#define UNW_LENGTH(x) ((x) & 0x00000000ffffffffL) + +enum unw_register_index { + /* primary unat: */ + UNW_REG_PRI_UNAT_GR, + UNW_REG_PRI_UNAT_MEM, + + /* register stack */ + UNW_REG_BSP, /* register stack pointer */ + UNW_REG_BSPSTORE, + UNW_REG_PFS, /* previous function state */ + UNW_REG_RNAT, + /* memory stack */ + UNW_REG_PSP, /* previous memory stack pointer */ + /* return pointer: */ + UNW_REG_RP, + + /* preserved registers: */ + UNW_REG_R4, UNW_REG_R5, UNW_REG_R6, UNW_REG_R7, + UNW_REG_UNAT, UNW_REG_PR, UNW_REG_LC, UNW_REG_FPSR, + UNW_REG_B1, UNW_REG_B2, UNW_REG_B3, UNW_REG_B4, UNW_REG_B5, + UNW_REG_F2, UNW_REG_F3, UNW_REG_F4, UNW_REG_F5, + UNW_REG_F16, UNW_REG_F17, UNW_REG_F18, UNW_REG_F19, + UNW_REG_F20, UNW_REG_F21, UNW_REG_F22, UNW_REG_F23, + UNW_REG_F24, UNW_REG_F25, UNW_REG_F26, UNW_REG_F27, + UNW_REG_F28, UNW_REG_F29, UNW_REG_F30, UNW_REG_F31, + UNW_NUM_REGS +}; + +struct unw_info_block { + u64 header; + u64 desc[0]; /* unwind descriptors */ + /* personality routine and language-specific data follow behind descriptors */ +}; + +struct unw_table { + struct unw_table *next; /* must be first member! */ + const char *name; + unsigned long gp; /* global pointer for this load-module */ + unsigned long segment_base; /* base for offsets in the unwind table entries */ + unsigned long start; + unsigned long end; + const struct unw_table_entry *array; + unsigned long length; +}; + +enum unw_where { + UNW_WHERE_NONE, /* register isn't saved at all */ + UNW_WHERE_GR, /* register is saved in a general register */ + UNW_WHERE_FR, /* register is saved in a floating-point register */ + UNW_WHERE_BR, /* register is saved in a branch register */ + UNW_WHERE_SPREL, /* register is saved on memstack (sp-relative) */ + UNW_WHERE_PSPREL, /* register is saved on memstack (psp-relative) */ + /* + * At the end of each prologue these locations get resolved to + * UNW_WHERE_PSPREL and UNW_WHERE_GR, respectively: + */ + UNW_WHERE_SPILL_HOME, /* register is saved in its spill home */ + UNW_WHERE_GR_SAVE /* register is saved in next general register */ +}; + +#define UNW_WHEN_NEVER 0x7fffffff + +struct unw_reg_info { + unsigned long val; /* save location: register number or offset */ + enum unw_where where; /* where the register gets saved */ + int when; /* when the register gets saved */ +}; + +struct unw_reg_state { + struct unw_reg_state *next; /* next (outer) element on state stack */ + struct unw_reg_info reg[UNW_NUM_REGS]; /* register save locations */ +}; + +struct unw_labeled_state { + struct unw_labeled_state *next; /* next labeled state (or NULL) */ + unsigned long label; /* label for this state */ + struct unw_reg_state saved_state; +}; + +struct unw_state_record { + unsigned int first_region : 1; /* is this the first region? */ + unsigned int done : 1; /* are we done scanning descriptors? */ + unsigned int any_spills : 1; /* got any register spills? */ + unsigned int in_body : 1; /* are we inside a body (as opposed to a prologue)? 
*/ + unsigned long flags; /* see UNW_FLAG_* in unwind.h */ + + u8 *imask; /* imask of spill_mask record or NULL */ + unsigned long pr_val; /* predicate values */ + unsigned long pr_mask; /* predicate mask */ + long spill_offset; /* psp-relative offset for spill base */ + int region_start; + int region_len; + int epilogue_start; + int epilogue_count; + int when_target; + + u8 gr_save_loc; /* next general register to use for saving a register */ + u8 return_link_reg; /* branch register in which the return link is passed */ + + struct unw_labeled_state *labeled_states; /* list of all labeled states */ + struct unw_reg_state curr; /* current state */ +}; + +enum unw_nat_type { + UNW_NAT_NONE, /* NaT not represented */ + UNW_NAT_VAL, /* NaT represented by NaT value (fp reg) */ + UNW_NAT_MEMSTK, /* NaT value is in unat word at offset OFF */ + UNW_NAT_REGSTK /* NaT is in rnat */ +}; + +enum unw_insn_opcode { + UNW_INSN_ADD, /* s[dst] += val */ + UNW_INSN_ADD_PSP, /* s[dst] = (s.psp + val) */ + UNW_INSN_ADD_SP, /* s[dst] = (s.sp + val) */ + UNW_INSN_MOVE, /* s[dst] = s[val] */ + UNW_INSN_MOVE2, /* s[dst] = s[val]; s[dst+1] = s[val+1] */ + UNW_INSN_MOVE_STACKED, /* s[dst] = ia64_rse_skip(*s.bsp, val) */ + UNW_INSN_SETNAT_MEMSTK, /* s[dst+1].nat.type = MEMSTK; + s[dst+1].nat.off = *s.pri_unat - s[dst] */ + UNW_INSN_SETNAT_TYPE, /* s[dst+1].nat.type = val */ + UNW_INSN_LOAD, /* s[dst] = *s[val] */ + UNW_INSN_MOVE_SCRATCH, /* s[dst] = scratch reg "val" */ + UNW_INSN_MOVE_CONST, /* s[dst] = constant reg "val" */ +}; + +struct unw_insn { + unsigned int opc : 4; + unsigned int dst : 9; + signed int val : 19; +}; + +/* + * Preserved general static registers (r4-r7) give rise to two script + * instructions; everything else yields at most one instruction; at + * the end of the script, the psp gets popped, accounting for one more + * instruction. + */ +#define UNW_MAX_SCRIPT_LEN (UNW_NUM_REGS + 5) + +struct unw_script { + unsigned long ip; /* ip this script is for */ + unsigned long pr_mask; /* mask of predicates script depends on */ + unsigned long pr_val; /* predicate values this script is for */ + rwlock_t lock; + unsigned int flags; /* see UNW_FLAG_* in unwind.h */ + unsigned short lru_chain; /* used for least-recently-used chain */ + unsigned short coll_chain; /* used for hash collisions */ + unsigned short hint; /* hint for next script to try (or -1) */ + unsigned short count; /* number of instructions in script */ + struct unw_insn insn[UNW_MAX_SCRIPT_LEN]; +}; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S new file mode 100644 index 000000000000..b9f0db4c1b04 --- /dev/null +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -0,0 +1,251 @@ +#include <linux/config.h> + +#include <asm/cache.h> +#include <asm/ptrace.h> +#include <asm/system.h> +#include <asm/pgtable.h> + +#define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) +#include <asm-generic/vmlinux.lds.h> + +OUTPUT_FORMAT("elf64-ia64-little") +OUTPUT_ARCH(ia64) +ENTRY(phys_start) +jiffies = jiffies_64; +PHDRS { + code PT_LOAD; + percpu PT_LOAD; + data PT_LOAD; +} +SECTIONS +{ + /* Sections to be discarded */ + /DISCARD/ : { + *(.exit.text) + *(.exit.data) + *(.exitcall.exit) + *(.IA_64.unwind.exit.text) + *(.IA_64.unwind_info.exit.text) + } + + v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ + phys_start = _start - LOAD_OFFSET; + + code : { } :code + . 
= KERNEL_START; + + _text = .; + _stext = .; + + .text : AT(ADDR(.text) - LOAD_OFFSET) + { + *(.text.ivt) + *(.text) + SCHED_TEXT + LOCK_TEXT + *(.gnu.linkonce.t*) + } + .text2 : AT(ADDR(.text2) - LOAD_OFFSET) + { *(.text2) } +#ifdef CONFIG_SMP + .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) + { *(.text.lock) } +#endif + _etext = .; + + /* Read-only data */ + + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) + { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } + + .data.patch.vtop : AT(ADDR(.data.patch.vtop) - LOAD_OFFSET) + { + __start___vtop_patchlist = .; + *(.data.patch.vtop) + __end___vtop_patchlist = .; + } + + .data.patch.mckinley_e9 : AT(ADDR(.data.patch.mckinley_e9) - LOAD_OFFSET) + { + __start___mckinley_e9_bundles = .; + *(.data.patch.mckinley_e9) + __end___mckinley_e9_bundles = .; + } + + /* Global data */ + _data = .; + +#if defined(CONFIG_IA64_GENERIC) + /* Machine Vector */ + . = ALIGN(16); + .machvec : AT(ADDR(.machvec) - LOAD_OFFSET) + { + machvec_start = .; + *(.machvec) + machvec_end = .; + } +#endif + + /* Unwind info & table: */ + . = ALIGN(8); + .IA_64.unwind_info : AT(ADDR(.IA_64.unwind_info) - LOAD_OFFSET) + { *(.IA_64.unwind_info*) } + .IA_64.unwind : AT(ADDR(.IA_64.unwind) - LOAD_OFFSET) + { + __start_unwind = .; + *(.IA_64.unwind*) + __end_unwind = .; + } + + RODATA + + .opd : AT(ADDR(.opd) - LOAD_OFFSET) + { *(.opd) } + + /* Initialization code and data: */ + + . = ALIGN(PAGE_SIZE); + __init_begin = .; + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) + { + _sinittext = .; + *(.init.text) + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) + { *(.init.data) } + + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) + { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) + { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) + { + __initcall_start = .; + *(.initcall1.init) + *(.initcall2.init) + *(.initcall3.init) + *(.initcall4.init) + *(.initcall5.init) + *(.initcall6.init) + *(.initcall7.init) + __initcall_end = .; + } + __con_initcall_start = .; + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) + { *(.con_initcall.init) } + __con_initcall_end = .; + __security_initcall_start = .; + .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) + { *(.security_initcall.init) } + __security_initcall_end = .; + . = ALIGN(PAGE_SIZE); + __init_end = .; + + /* The initial task and kernel stack */ + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) + { *(.data.init_task) } + + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) + { *(__special_page_section) + __start_gate_section = .; + *(.data.gate) + __stop_gate_section = .; + } + . = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose kernel data */ + + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) + { *(.data.cacheline_aligned) } + + /* Per-cpu data: */ + percpu : { } :percpu + . = ALIGN(PERCPU_PAGE_SIZE); + __phys_per_cpu_start = .; + .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) + { + __per_cpu_start = .; + *(.data.percpu) + __per_cpu_end = .; + } + . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits into percpu page size */ + + data : { } :data + .data : AT(ADDR(.data) - LOAD_OFFSET) + { *(.data) *(.data1) *(.gnu.linkonce.d*) CONSTRUCTORS } + + . 
= ALIGN(16); /* gp must be 16-byte aligned for exc. table */ + .got : AT(ADDR(.got) - LOAD_OFFSET) + { *(.got.plt) *(.got) } + __gp = ADDR(.got) + 0x200000; + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : AT(ADDR(.sdata) - LOAD_OFFSET) + { *(.sdata) *(.sdata1) *(.srdata) } + _edata = .; + _bss = .; + .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) + { *(.sbss) *(.scommon) } + .bss : AT(ADDR(.bss) - LOAD_OFFSET) + { *(.bss) *(COMMON) } + + _end = .; + + code : { } :code + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* These must appear regardless of . */ + /* Discard them for now since Intel SoftSDV cannot handle them. + .comment 0 : { *(.comment) } + .note 0 : { *(.note) } + */ + /DISCARD/ : { *(.comment) } + /DISCARD/ : { *(.note) } +} |
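A small sketch of the AT(...) arithmetic the linker script uses throughout: each section's load address is its virtual address minus LOAD_OFFSET, so symbols resolve to KERNEL_START-based virtual addresses while the image's load addresses begin at the kernel's mapped translation page. The constants below are invented stand-ins for KERNEL_START and KERNEL_TR_PAGE_SIZE, purely to show the relationship.

#include <stdio.h>
#include <stdint.h>

#define FAKE_KERNEL_START	0xa000000100000000ULL	/* stand-in value */
#define FAKE_TR_PAGE_SIZE	0x0000000001000000ULL	/* stand-in: 16MB mapped page */
#define FAKE_LOAD_OFFSET	(FAKE_KERNEL_START - FAKE_TR_PAGE_SIZE)

int
main (void)
{
	uint64_t vaddrs[] = { FAKE_KERNEL_START, FAKE_KERNEL_START + 0x8000 };
	unsigned i;

	/* virtual-to-load translation as done by AT(ADDR(section) - LOAD_OFFSET) */
	for (i = 0; i < 2; ++i)
		printf("virt 0x%016llx -> load 0x%016llx\n",
		       (unsigned long long) vaddrs[i],
		       (unsigned long long) (vaddrs[i] - FAKE_LOAD_OFFSET));
	return 0;
}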