gaiser.org - c99shell

!C99Shell v. 1.0 pre-release build #13!
Software: Apache/2.0.54 (Unix) mod_perl/1.99_09 Perl/v5.8.0 mod_ssl/2.0.54 OpenSSL/0.9.7l DAV/2 FrontPage/5.0.2.2635 PHP/4.4.0 mod_gzip/2.0.26.1a uname -a: Linux snow.he.net 4.4.276-v2-mono-1 #1 SMP Wed Jul 21 11:21:17 PDT 2021 i686 uid=99(nobody) gid=98(nobody) groups=98(nobody) Safe-mode: OFF (not secure) /usr/src/linux-2.4.18-xfs-1.1/arch/ia64/kernel/ drwxr-xr-x Free 318.35 GB of 458.09 GB (69.49%) Encoder Tools Proc. FTP brute Sec. SQL PHP-code Update Feedback Self remove Logout

/*
 * This file contains the code to configure and read/write the ia64 performance
 * monitoring stuff.
 *
 * Originally written by Ganesh Venkitachalam, IBM Corp.
 * Modifications by David Mosberger-Tang, Hewlett-Packard Co.
 * Modifications by Stephane Eranian, Hewlett-Packard Co.
 * Copyright (C) 1999 Ganesh Venkitachalam <venkitac@us.ibm.com>
 * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
 * Copyright (C) 2000-2001 Stephane Eranian <eranian@hpl.hp.com>
 */

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/wrapper.h>
#include <linux/mm.h>

#include <asm/bitops.h>
#include <asm/efi.h>
#include <asm/errno.h>
#include <asm/hw_irq.h>
#include <asm/page.h>
#include <asm/pal.h>
#include <asm/perfmon.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/signal.h>
#include <asm/system.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/delay.h> /* for ia64_get_itc() */

#ifdef CONFIG_PERFMON

/* interface version string and sampling-buffer header format version */
#define PFM_VERSION             "0.3"
#define PFM_SMPL_HDR_VERSION    1

/* index of the first generic counter (PMC/PMD pair) */
#define PMU_FIRST_COUNTER       4

/*
 * perfmon command codes, listed in numerical order
 */
#define PFM_WRITE_PMCS          0xa0
#define PFM_WRITE_PMDS          0xa1
#define PFM_READ_PMDS           0xa2
#define PFM_STOP                0xa3
#define PFM_START               0xa4
#define PFM_ENABLE              0xa5    /* unfreeze only */
#define PFM_DISABLE             0xa6    /* freeze only */
#define PFM_CREATE_CONTEXT      0xa7
#define PFM_DESTROY_CONTEXT     0xa8
#define PFM_RESTART             0xcf

/*
 * Debugging-only commands. sysctl() was considered for this purpose but
 * judged too pervasive; dedicated commands keep the feature self-contained.
 */
#define PFM_DEBUG_ON            0xe0
#define PFM_DEBUG_OFF           0xe1

/* first command code of the debug range */
#define PFM_DEBUG_BASE          PFM_DEBUG_ON


/*
 * perfmon API flags (passed at context creation)
 */
#define PFM_FL_INHERIT_NONE         0           /* never inherit a context across fork (default) */
#define PFM_FL_INHERIT_ONCE         (1 << 0)    /* clone pfm_context only once across fork() */
#define PFM_FL_INHERIT_ALL          (1 << 1)    /* always clone pfm_context across fork() */
#define PFM_FL_SMPL_OVFL_NOBLOCK    (1 << 2)    /* do not block on sampling buffer overflow */
#define PFM_FL_SYSTEM_WIDE          (1 << 3)    /* create a system wide context */
#define PFM_FL_EXCL_INTR            (1 << 4)    /* exclude interrupt from system wide monitoring */

/*
 * PMC API flags (per register)
 */
#define PFM_REGFL_OVFL_NOTIFY       (1 << 0)    /* send notification on overflow */

/*
 * Private flags and masks
 */

/* every bit that encodes the inheritance mode */
#define PFM_FL_INHERIT_MASK         (PFM_FL_INHERIT_ALL|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_NONE)

/*
 * cpu_is_online(i): non-zero when bit i is set in cpu_online_map.
 * Always 1 when the kernel is not built for SMP.
 */
#ifdef CONFIG_SMP
#define cpu_is_online(i)    (cpu_online_map & (1UL << (i)))
#else
#define cpu_is_online(i)    1
#endif

/*
 * PMC_IS_IMPL(i) / PMD_IS_IMPL(i): 1 when register i is below the register
 * count recorded in pmu_conf and its bit is set in the implemented-register
 * bitmask, 0 otherwise. pmu_conf.impl_regs[] is used as two bitvectors of
 * 64-bit words: words 0.. are indexed by PMC number, words 4.. by PMD number.
 *
 * The bit inside a word is selected with (i & 63). The mask must be built
 * from an unsigned long so that bits 31..63 can be reached.
 */
#define PMC_IS_IMPL(i)        ((i) < pmu_conf.num_pmcs && \
                 (pmu_conf.impl_regs[(i)>>6] & (1UL << ((i) & (64-1)))) != 0)
#define PMD_IS_IMPL(i)        ((i) < pmu_conf.num_pmds && \
                 (pmu_conf.impl_regs[4+((i)>>6)] & (1UL << ((i) & (64-1)))) != 0)

/*
 * PMD_IS_COUNTER(i) / PMC_IS_COUNTER(i): true when register i lies in the
 * range of generic counters, i.e. [PMU_FIRST_COUNTER,
 * PMU_FIRST_COUNTER + pmu_conf.max_counters).
 */
#define PMD_IS_COUNTER(i)    ((i) >= PMU_FIRST_COUNTER && (i) < (PMU_FIRST_COUNTER+pmu_conf.max_counters))
#define PMC_IS_COUNTER(i)    ((i) >= PMU_FIRST_COUNTER && (i) < (PMU_FIRST_COUNTER+pmu_conf.max_counters))

/*
 * Itanium-specific layout of a PMC register used for counter configuration,
 * described as bit-fields of one unsigned long (the widths add up to 64).
 * The pmc_es (event select) field is the one inspected by the
 * PMC_IS_DEAR/PMC_IS_IEAR/PMC_IS_BTB tests.
 */
typedef struct {
    unsigned long pmc_plm:4;    /* privilege level mask */
    unsigned long pmc_ev:1;        /* external visibility */
    unsigned long pmc_oi:1;        /* overflow interrupt */
    unsigned long pmc_pm:1;        /* privileged monitor */
    unsigned long pmc_ig1:1;    /* reserved */
    unsigned long pmc_es:7;        /* event select */
    unsigned long pmc_ig2:1;    /* reserved */
    unsigned long pmc_umask:4;    /* unit mask */
    unsigned long pmc_thres:3;    /* threshold */
    unsigned long pmc_ig3:1;    /* reserved (missing from table on p6-17) */
    unsigned long pmc_ism:2;    /* instruction set mask */
    unsigned long pmc_ig4:38;    /* reserved */
} pmc_counter_reg_t;

/*
 * Event-select codes identifying the EAR/BTB configurations
 */
#define PMU_BTB_EVENT     0x11
#define PMU_IEAR_EVENT    0x23
#define PMU_DEAR_EVENT    0x67

/* event-select field of the PMC value that 'a' points to */
#define PMC_EVENT_SELECT(a)    (((pmc_counter_reg_t *)(a))->pmc_es)

/* does the PMC value at 'a' program a D-EAR, I-EAR or BTB event ? */
#define PMC_IS_DEAR(a)        (PMC_EVENT_SELECT(a) == PMU_DEAR_EVENT)
#define PMC_IS_IEAR(a)        (PMC_EVENT_SELECT(a) == PMU_IEAR_EVENT)
#define PMC_IS_BTB(a)        (PMC_EVENT_SELECT(a) == PMU_BTB_EVENT)

/*
 * Header placed at the very beginning of the sampling buffer returned to the
 * user. It is exported read-only at this point and is immediately followed by
 * the first sample record.
 */
typedef struct {
    int        hdr_version;        /* format version, can be used to differentiate formats */
    int        hdr_reserved;
    unsigned long    hdr_entry_size;        /* size of one entry in bytes */
    unsigned long    hdr_count;        /* how many valid entries */
    unsigned long    hdr_pmds;        /* bitmask of the PMDs recorded in each entry */
} perfmon_smpl_hdr_t;

/*
 * Per-entry header of the sampling buffer.
 * Each entry header is directly followed by the saved PMDs, in increasing
 * index order: PMD4, PMD5, ... How many PMDs are present is not stored in the
 * entry; the tool must keep track of it when generating the final trace file.
 */
typedef struct {
    int        pid;        /* identification of process */
    int        cpu;        /* which cpu was used */
    unsigned long    rate;        /* initial value of this counter */
    unsigned long    stamp;        /* timestamp */
    unsigned long    ip;        /* where the overflow interrupt happened */
    unsigned long    regs;        /* which registers overflowed (up to 64) */
} perfmon_smpl_entry_t;

/*
 * Kernel-private descriptor of a sampling buffer; there is one per perfmon
 * context. Unlike the pfm_context itself, it is meant to be shared among
 * siblings, hence the reference count which is incremented on fork().
 *
 * Only the sampling buffer proper (header + entries) is exposed to the user;
 * this descriptor never is. That separation allows the buffer to be exported
 * read-write, if ever needed, without worrying about security problems.
 */
typedef struct {
    atomic_t        psb_refcnt;    /* how many users for the buffer */
    int            reserved;
    void            *psb_addr;    /* points to location of first entry (just past the header) */
    unsigned long        psb_entries;    /* maximum number of entries */
    unsigned long        psb_size;    /* aligned size of buffer */
    unsigned long        psb_index;    /* next free entry slot */
    unsigned long        psb_entry_size;    /* size of each entry including entry header */
    perfmon_smpl_hdr_t    *psb_hdr;    /* points to sampling buffer header (start of the buffer) */
} pfm_smpl_buffer_desc_t;


/*
 * Description of the main PMU characteristics as indicated by PAL.
 * The structure is initialized at boot time.
 */
typedef struct {
    unsigned long pfm_is_disabled;    /* indicates if perfmon is working properly */
    unsigned long perf_ovfl_val;    /* overflow value for generic counters   */
    unsigned long max_counters;    /* upper limit on counter pair (PMC/PMD) */
    unsigned long num_pmcs ;    /* highest PMC implemented (may have holes) */
    unsigned long num_pmds;        /* highest PMD implemented (may have holes) */
    unsigned long impl_regs[16];    /* buffer holding the implemented PMC/PMD masks:
                     * PMC bits start at word 0, PMD bits at word 4
                     * (see PMC_IS_IMPL/PMD_IS_IMPL) */
} pmu_config_t;

/* non-zero when the boot-time setup flagged perfmon as unusable */
#define PERFMON_IS_DISABLED() pmu_conf.pfm_is_disabled

/*
 * Software state kept for one generic counter of a context
 * (one element of pfm_context_t.ctx_pmds[]).
 */
typedef struct {
    __u64        val;        /* virtual 64bit counter value */
    __u64        ival;        /* initial value from user */
    __u64        smpl_rval;    /* reset value on sampling overflow */
    __u64        ovfl_rval;    /* reset value on overflow */
    int        flags;        /* notify/do not notify (PFM_REGFL_OVFL_NOTIFY) */
} pfm_counter_t;
/* non-zero when counter slot i of ctx asks for a notification on overflow */
#define PMD_OVFL_NOTIFY(ctx, i)    ((ctx)->ctx_pmds[i].flags &  PFM_REGFL_OVFL_NOTIFY)

/*
 * Flag bits of a perfmon context (stored in pfm_context_t.ctx_flags).
 * A perfmon context is one per process and is cloned on fork() depending on
 * the inheritance flags.
 */
typedef struct {
    unsigned int inherit:2;    /* inherit mode */
    unsigned int noblock:1;    /* block/don't block on overflow with notification */
    unsigned int system:1;    /* do system wide monitoring */
    unsigned int frozen:1;    /* pmu must be kept frozen on ctxsw in */
    unsigned int exclintr:1;/* exclude interrupts from system wide monitoring */
    unsigned int reserved:26;
} pfm_context_flags_t;

/*
 * The perfmon context proper: sampling buffer link, notification target,
 * bookkeeping of the registers in use and the software state of the
 * generic counters.
 */
typedef struct pfm_context {

    pfm_smpl_buffer_desc_t    *ctx_smpl_buf;        /* sampling buffer descriptor, if any */
    unsigned long        ctx_dear_counter;    /* which PMD holds D-EAR */
    unsigned long        ctx_iear_counter;    /* which PMD holds I-EAR */
    unsigned long        ctx_btb_counter;    /* which PMD holds BTB */

    spinlock_t        ctx_notify_lock;
    pfm_context_flags_t    ctx_flags;        /* inherit/noblock/system/frozen/exclintr bits */
    int            ctx_notify_sig;        /* XXX: SIGPROF or other */
    struct task_struct    *ctx_notify_task;    /* who to notify on overflow */
    struct task_struct    *ctx_creator;        /* task that created the context (debug) */

    unsigned long        ctx_ovfl_regs;        /* which registers just overflowed (notification) */
    unsigned long        ctx_smpl_regs;        /* which registers to record on overflow */

    struct semaphore    ctx_restart_sem;     /* use for blocking notification mode */

    unsigned long        ctx_used_pmds[4];     /* bitmask of used PMD (speedup ctxsw) */
    unsigned long        ctx_used_pmcs[4];     /* bitmask of used PMC (speedup ctxsw) */

    /* per-counter state, indexed by (register number - PMU_FIRST_COUNTER) */
    pfm_counter_t        ctx_pmds[IA64_NUM_PMD_COUNTERS]; /* XXX: size should be dynamic */

} pfm_context_t;

/*
 * CTX_USED_PMD(ctx,n) / CTX_USED_PMC(ctx,n): mark register n as used in the
 * context bitmasks (64 registers per word). The mask is built from an
 * unsigned long: an int-typed "1 << n" sign-extends for n == 31 and is
 * undefined for larger bit numbers.
 */
#define CTX_USED_PMD(ctx,n) ((ctx)->ctx_used_pmds[(n)>>6] |= 1UL << ((n) % 64))
#define CTX_USED_PMC(ctx,n) ((ctx)->ctx_used_pmcs[(n)>>6] |= 1UL << ((n) % 64))

/* shorthands for the individual bits of pfm_context_t.ctx_flags */
#define ctx_fl_inherit    ctx_flags.inherit
#define ctx_fl_noblock    ctx_flags.noblock
#define ctx_fl_system    ctx_flags.system
#define ctx_fl_frozen    ctx_flags.frozen
#define ctx_fl_exclintr    ctx_flags.exclintr

/* context predicates/accessors */
#define CTX_OVFL_NOBLOCK(c)    ((c)->ctx_fl_noblock == 1)    /* non-blocking overflow mode ? */
#define CTX_INHERIT_MODE(c)    ((c)->ctx_fl_inherit)        /* inheritance mode bits */
#define CTX_HAS_SMPL(c)        ((c)->ctx_smpl_buf != NULL)    /* sampling buffer attached ? */

/* PMU description consulted by the *_IS_IMPL/*_IS_COUNTER and PERFMON_IS_DISABLED macros */
static pmu_config_t pmu_conf;

/* for debug only */
static int pfm_debug=0;    /* 0= nodebug, >0= debug output on */

/*
 * DBprintk((fmt, args...)): debug trace, emitted only when pfm_debug > 0.
 * The argument is a complete, parenthesized printk() argument list. Every
 * message is prefixed with the name of the calling function and the line
 * number.
 *
 * The macro expands to a single statement WITHOUT a trailing semicolon, so
 * that "if (x) DBprintk((...)); else ..." parses as intended. Call sites
 * provide the ';'. The function name is passed through "%s" because
 * __FUNCTION__ cannot be pasted to a string literal with current compilers.
 */
#define DBprintk(a) \
    do { \
        if (pfm_debug > 0) { printk("%s %d: ", __FUNCTION__, __LINE__); printk a; } \
    } while (0)

/* forward declaration of a file-local function */
static void ia64_reset_pmu(void);

/*
 * Structure used to pass information between the interrupt handler
 * and the tasklet.
 */
typedef struct {
    pid_t        to_pid;        /* which process to notify */
    pid_t        from_pid;    /* which process is source of overflow */
    int        sig;        /* with which signal */
    unsigned long    bitvect;    /* which counters have overflowed */
} notification_info_t;


/*
 * Bookkeeping of the active monitoring sessions. Per-process sessions and a
 * system wide session exclude each other (see pfm_context_create()).
 */
typedef struct {
    unsigned long pfs_proc_sessions; /* number of per-process contexts currently reserved */
    unsigned long pfs_sys_session; /* can only be 0/1 */
    unsigned long pfs_dfl_dcr;    /* XXX: hack, DCR value saved before DCR.pp is set */
    unsigned int  pfs_pp;        /* NOTE(review): not referenced in the visible code, purpose to be confirmed */
} pfm_session_t;

/*
 * One cacheline-aligned slot per CPU holding the task recorded through
 * SET_PMU_OWNER() on that CPU (read back with PMU_OWNER()).
 */
struct {
    struct task_struct *owner;
} ____cacheline_aligned pmu_owners[NR_CPUS];


/*
 * helper macros
 */

/*
 * SET_PMU_OWNER(t): record task t in the pmu_owners[] slot of the current CPU.
 * PMU_OWNER(): task recorded for the current CPU.
 *
 * SET_PMU_OWNER expands to one statement without a trailing semicolon so it
 * can be used safely in an if/else; the call site supplies the ';'.
 */
#define SET_PMU_OWNER(t)    do { pmu_owners[smp_processor_id()].owner = (t); } while (0)
#define PMU_OWNER()        (pmu_owners[smp_processor_id()].owner)

/*
 * PFM_CAN_DO_LAZY(): true when no system wide session is active and, on SMP
 * kernels, only one CPU is running.
 */
#ifdef CONFIG_SMP
#define PFM_CAN_DO_LAZY()    (smp_num_cpus==1 && pfs_info.pfs_sys_session==0)
#else
#define PFM_CAN_DO_LAZY()    (pfs_info.pfs_sys_session==0)
#endif

/* forward declaration of a file-local function */
static void pfm_lazy_save_regs (struct task_struct *ta);

/* for debug only */
static struct proc_dir_entry *perfmon_dir;

/*
 * Session bookkeeping for the whole system.
 * XXX: hack to indicate that a system wide monitoring session is active
 */
static pfm_session_t pfs_info;

/*
 * Derives the number of PM(C|D) registers from the bitvector returned by PAL.
 *
 * buffer points to a 4-word bitvector. The words are scanned from the most
 * significant one down to the first non-zero word; the result is one more
 * than the index of the highest bit set. When no bit is set at all, an error
 * is logged and 0 is returned.
 */
static unsigned long __init
find_num_pm_regs(long *buffer)
{
    int word;

    for (word = 3; word >= 0; word--) {
        if (buffer[word] != 0)
            return 1+ ia64_fls(buffer[word]) + 64 * word;
    }

    printk(KERN_ERR "perfmon: No bit set in pm_buffer\n");
    return 0;
}


/*
 * Returns a timestamp that is unique on a given CPU: the current value
 * reported by ia64_get_itc().
 */
static inline unsigned long
perfmon_get_stamp(void)
{
    unsigned long now;

    /* XXX: maybe find something more efficient */
    now = ia64_get_itc();

    return now;
}

/*
 * Given the PGD entry from an address space's page table, returns the kernel
 * virtual address of the physical memory mapped at adr (page address plus the
 * offset of adr inside its page). Returns 0 when one of the page table levels
 * is empty or the page is not present.
 */
static inline unsigned long
uvirt_to_kva(pgd_t *pgd, unsigned long adr)
{
    unsigned long kva = 0UL;
    pmd_t *mid;
    pte_t *slot, entry;

    if (pgd_none(*pgd)) goto done;

    mid = pmd_offset(pgd, adr);
    if (pmd_none(*mid)) goto done;

    slot  = pte_offset(mid, adr);
    entry = *slot;
    if (!pte_present(entry)) goto done;

    kva  = (unsigned long) page_address(pte_page(entry));
    kva |= (adr & (PAGE_SIZE - 1));

done:
    DBprintk(("uv2kva(%lx-->%lx)\n", adr, kva));
    return kva;
}


/*
 * Translates the kernel virtual address adr into a physical address with
 * ia64_tpa(). The physical address is what is needed when initializing the
 * contents of the area and marking the pages as reserved.
 */
static inline unsigned long
kvirt_to_pa(unsigned long adr)
{
    __u64 phys;

    phys = ia64_tpa(adr);

    DBprintk(("kv2pa(%lx-->%lx)\n", adr, phys));

    return phys;
}

static void *
rvmalloc(unsigned long size)
{
    void *mem;
    unsigned long adr, page;

    /* XXX: may have to revisit this part because
     * vmalloc() does not necessarily return a page-aligned buffer.
     * This maybe a security problem when mapped at user level
     */
    mem=vmalloc(size);
    if (mem) {
        memset(mem, 0, size); /* Clear the ram out, no junk to the user */
        adr=(unsigned long) mem;
        while (size > 0) {
            page = kvirt_to_pa(adr);
            mem_map_reserve(virt_to_page(__va(page)));
            adr+=PAGE_SIZE;
            size-=PAGE_SIZE;
        }
    }
    return mem;
}

static void
rvfree(void *mem, unsigned long size)
{
    unsigned long adr, page;

    if (mem) {
        adr=(unsigned long) mem;
        while (size > 0) {
            page = kvirt_to_pa(adr);
            mem_map_unreserve(virt_to_page(__va(page)));
            adr+=PAGE_SIZE;
            size-=PAGE_SIZE;
        }
        vfree(mem);
    }
}

static pfm_context_t *
pfm_context_alloc(void)
{
    pfm_context_t *pfc;

    /* allocate context descriptor */
    pfc = vmalloc(sizeof(*pfc));
    if (pfc) memset(pfc, 0, sizeof(*pfc));

    return pfc;
}

/*
 * Releases a context descriptor obtained from pfm_context_alloc().
 * A NULL pointer is ignored.
 */
static void
pfm_context_free(pfm_context_t *pfc)
{
    if (pfc == NULL) return;

    vfree(pfc);
}

static int
pfm_remap_buffer(unsigned long buf, unsigned long addr, unsigned long size)
{
    unsigned long page;

    while (size > 0) {
        page = kvirt_to_pa(buf);

        if (remap_page_range(addr, page, PAGE_SIZE, PAGE_SHARED)) return -ENOMEM;

        addr  += PAGE_SIZE;
        buf   += PAGE_SIZE;
        size  -= PAGE_SIZE;
    }
    return 0;
}

/*
 * Counts the number of PMDs to save per entry, i.e. the number of bits set
 * in the bitmask made of size 64-bit words starting at which.
 * This code is generic enough to accommodate more than 64 PMDs when they
 * become available.
 *
 * The word index has the same unsigned type as size, which avoids the
 * signed/unsigned comparison of an int index against an unsigned long bound.
 */
static unsigned long
pfm_smpl_entry_size(unsigned long *which, unsigned long size)
{
    unsigned long res = 0;
    unsigned long i;

    for (i = 0; i < size; i++) res += hweight64(which[i]);

    DBprintk((" res=%ld\n", res));

    return res;
}

/*
 * Allocates the sampling buffer and remaps it into the caller's address space.
 *
 * ctx        context the buffer gets attached to (ctx_smpl_buf, ctx_smpl_regs)
 * which_pmds bitmask of the PMDs recorded in each entry
 * entries    number of entries requested by the user
 * user_addr  receives the user-level address of the buffer on success
 *
 * Returns 0 on success, -EAGAIN when the requested size is too large
 * (arithmetic overflow or above RLIMIT_MEMLOCK), -ENOMEM when an address
 * range or memory cannot be obtained.
 *
 * NOTE(review): remap_page_range() and insert_vm_struct() are called here
 * without taking mm->mmap_sem; confirm that the caller holds it.
 * NOTE(review): if remapping fails half-way, the pages already remapped are
 * not unmapped before the buffer is freed.
 */
static int
pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long which_pmds, unsigned long entries, void **user_addr)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long addr, size, regcount, entry_size;
    void *smpl_buf;
    pfm_smpl_buffer_desc_t *psb;

    regcount = pfm_smpl_entry_size(&which_pmds, 1);

    /* note that regcount might be 0, in this case only the header for each
     * entry will be recorded.
     */
    entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64);

    /*
     * entries comes straight from the user: refuse any value for which the
     * size computation below (including the page rounding) would wrap around.
     * A wrapped size would pass the limit check while psb_entries still
     * advertises the huge entry count.
     */
    if (entries > (~0UL - sizeof(perfmon_smpl_hdr_t) - PAGE_SIZE) / entry_size) return -EAGAIN;

    /*
     * 1 buffer hdr and for each entry a header + regcount PMDs to save
     */
    size = PAGE_ALIGN(sizeof(perfmon_smpl_hdr_t) + entries * entry_size);

    /*
     * check requested size to avoid Denial-of-service attacks
     * XXX: may have to refine this test
     */
    if (size > current->rlim[RLIMIT_MEMLOCK].rlim_cur) return -EAGAIN;

    /*
     * find some free area in address space.
     * Besides 0, also reject a value which is not page aligned: it is assumed
     * that get_unmapped_area() may report failure as a negative errno stored
     * in the unsigned long -- TODO confirm against mm/mmap.c of this kernel.
     */
    addr = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE);
    if (!addr || (addr & ~PAGE_MASK)) goto no_addr;

    DBprintk((" entries=%ld aligned size=%ld, unmapped @0x%lx\n", entries, size, addr));

    /* allocate vma */
    vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
    if (!vma) goto no_vma;

    /* XXX: see rvmalloc() for page alignment problem. The buffer comes back zero-filled */
    smpl_buf = rvmalloc(size);
    if (smpl_buf == NULL) goto no_buffer;

    DBprintk((" smpl_buf @%p\n", smpl_buf));

    /*
     * allocate the sampling buffer descriptor BEFORE touching the user
     * address space: a failure here must not leave user mappings behind
     * which would point to pages that are about to be freed.
     */
    psb = vmalloc(sizeof(*psb));
    if (psb == NULL) goto no_buffer_desc;

    if (pfm_remap_buffer((unsigned long)smpl_buf, addr, size)) goto cant_remap;

    psb->psb_hdr     = smpl_buf;
    psb->psb_addr    = (char *)smpl_buf+sizeof(perfmon_smpl_hdr_t); /* first entry */
    psb->psb_size    = size; /* aligned size */
    psb->psb_index   = 0;
    psb->psb_entries = entries;

    atomic_set(&psb->psb_refcnt, 1);

    psb->psb_entry_size = entry_size;

    DBprintk((" psb @%p entry_size=%ld hdr=%p addr=%p\n", (void *)psb,psb->psb_entry_size, (void *)psb->psb_hdr, (void *)psb->psb_addr));

    /* initialize some of the fields of header */
    psb->psb_hdr->hdr_version    = PFM_SMPL_HDR_VERSION;
    psb->psb_hdr->hdr_entry_size = entry_size;
    psb->psb_hdr->hdr_pmds         = which_pmds;

    /* store which PMDS to record */
    ctx->ctx_smpl_regs = which_pmds;

    /* link to perfmon context */
    ctx->ctx_smpl_buf  = psb;

    /*
     * initialize the vma for the sampling buffer
     */
    vma->vm_mm      = mm;
    vma->vm_start      = addr;
    vma->vm_end      = addr + size;
    vma->vm_flags      = VM_READ|VM_MAYREAD;
    vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */
    vma->vm_ops      = NULL;
    vma->vm_pgoff      = 0;
    vma->vm_file      = NULL;
    vma->vm_raend      = 0;

    vma->vm_private_data = ctx;    /* link to pfm_context(not yet used) */

    /*
     * now insert the vma in the vm list for the process
     */
    insert_vm_struct(mm, vma);

    mm->total_vm  += size >> PAGE_SHIFT;

    /*
     * that's the address returned to the user
     */
    *user_addr = (void *)addr;

    return 0;

    /* outlined error handling: resources are released in reverse order of acquisition */
no_addr:
    DBprintk(("Cannot find unmapped area for size %ld\n", size));
    return -ENOMEM;
no_vma:
    DBprintk(("Cannot allocate vma\n"));
    return -ENOMEM;
cant_remap:
    DBprintk(("Can't remap buffer\n"));
    vfree(psb);
    goto free_buffer;
no_buffer_desc:
    DBprintk(("Can't allocate sampling buffer descriptor\n"));
free_buffer:
    rvfree(smpl_buf, size);
    goto free_vma;
no_buffer:
    DBprintk(("Can't allocate sampling buffer\n"));
free_vma:
    kmem_cache_free(vm_area_cachep, vma);
    return -ENOMEM;
}

/*
 * Sanity checks on a context creation request.
 *
 * Returns 0 when the request is acceptable, -EINVAL when:
 *  - the notification signal is neither 0 nor SIGPROF,
 *  - the notification pid is negative or 1 (0 means do not notify),
 *  - system wide monitoring is requested on an SMP kernel running more than
 *    one CPU, or without the non-blocking overflow flag.
 */
static int
pfx_is_sane(pfreq_context_t *pfx)
{
    int requested;

    /* only SIGPROF, or no signal at all, is accepted */
    if (pfx->notify_sig !=0 && pfx->notify_sig != SIGPROF) return -EINVAL;

    /* cannot send to process 1, 0 means do not notify */
    if (pfx->notify_pid < 0 || pfx->notify_pid == 1) return -EINVAL;

    requested = pfx->flags;

    /* the remaining checks only concern system wide contexts */
    if ((requested & PFM_FL_SYSTEM_WIDE) == 0) return 0;

#ifdef CONFIG_SMP
    if (smp_num_cpus > 1) {
        printk("perfmon: system wide monitoring on SMP not yet supported\n");
        return -EINVAL;
    }
#endif
    if (requested & PFM_FL_SMPL_OVFL_NOBLOCK) return 0;

    /* probably more to add here */

    printk("perfmon: system wide monitoring cannot use blocking notification mode\n");
    return -EINVAL;
}

/*
 * Creates a perfmon context for the current task from the user request req.
 *
 * flags  obsolete perfmon() flags; a warning is printed when non-zero
 * req    user pointer to the request; the pfr_ctx part is read and, when a
 *        sampling buffer is requested, smpl_vaddr is written back
 *
 * The function reserves a session (system wide or per-process), allocates the
 * context, records the task to notify, optionally allocates the sampling
 * buffer, and finally attaches the context to current->thread.pfm_context.
 *
 * Returns 0 on success, or:
 *  -EFAULT  the request cannot be copied from/to user space
 *  -EINVAL  the request is not sane or the notification task does not exist
 *  -EBUSY   the requested session type conflicts with the active sessions
 *  -ENOMEM  the context cannot be allocated
 *  or the error reported by pfm_smpl_buffer_alloc().
 *
 * NOTE(review): when copy_to_user() fails after the sampling buffer has been
 * allocated, the buffer, its descriptor and its vma are not released here.
 */
static int
pfm_context_create(int flags, perfmon_req_t *req)
{
    pfm_context_t *ctx;
    struct task_struct *task = NULL;
    perfmon_req_t tmp;
    void *uaddr = NULL;
    int ret;
    int ctx_flags;
    pid_t pid;

    /* to go away */
    if (flags) {
        printk("perfmon: use context flags instead of perfmon() flags. Obsoleted API\n");
    }

    if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

    ret = pfx_is_sane(&tmp.pfr_ctx);
    if (ret < 0) return ret;

    ctx_flags = tmp.pfr_ctx.flags;

    if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
        /*
         * XXX: This is not AT ALL SMP safe
         */
        if (pfs_info.pfs_proc_sessions > 0) return -EBUSY;
        if (pfs_info.pfs_sys_session > 0) return -EBUSY;

        pfs_info.pfs_sys_session = 1;

    } else if (pfs_info.pfs_sys_session >0) {
        /* no per-process monitoring while there is a system wide session */
        return -EBUSY;
    } else
        pfs_info.pfs_proc_sessions++;

    ctx = pfm_context_alloc();
    if (!ctx) {
        /* ret still holds the 0 returned by pfx_is_sane(): report the failure */
        ret = -ENOMEM;
        goto error;
    }

    /* record the creator (debug only) */
    ctx->ctx_creator = current;

    pid = tmp.pfr_ctx.notify_pid;

    spin_lock_init(&ctx->ctx_notify_lock);

    if (pid == current->pid) {
        ctx->ctx_notify_task = task = current;
        current->thread.pfm_context = ctx;

        atomic_set(&current->thread.pfm_notifiers_check, 1);

    } else if (pid!=0) {
        read_lock(&tasklist_lock);

        task = find_task_by_pid(pid);
        if (task) {
            /*
              * record who to notify
              */
            ctx->ctx_notify_task = task;

            /* 
              * make visible
              * must be done inside critical section
              *
              * if the initialization does not go through it is still
              * okay because child will do the scan for nothing which
              * won't hurt.
              */
            current->thread.pfm_context = ctx;

            /*
             * will cause task to check on exit for monitored
             * processes that would notify it. see release_thread()
             * Note: the scan MUST be done in release thread, once the
             * task has been detached from the tasklist otherwise you are
             * exposed to race conditions.
             */
            atomic_add(1, &task->thread.pfm_notifiers_check);
        }
        read_unlock(&tasklist_lock);
    }

    /*
     * notification process does not exist
     */
    if (pid != 0 && task == NULL) {
        ret = -EINVAL;
        goto buffer_error;
    }

    ctx->ctx_notify_sig = SIGPROF;    /* siginfo imposes a fixed signal */

    if (tmp.pfr_ctx.smpl_entries) {
        DBprintk((" sampling entries=%ld\n",tmp.pfr_ctx.smpl_entries));

        ret = pfm_smpl_buffer_alloc(ctx, tmp.pfr_ctx.smpl_regs, 
                         tmp.pfr_ctx.smpl_entries, &uaddr);
        if (ret<0) goto buffer_error;

        tmp.pfr_ctx.smpl_vaddr = uaddr;
    }
    /* initialization of context's flags */
    ctx->ctx_fl_inherit  = ctx_flags & PFM_FL_INHERIT_MASK;
    ctx->ctx_fl_noblock  = (ctx_flags & PFM_FL_SMPL_OVFL_NOBLOCK) ? 1 : 0;
    ctx->ctx_fl_system   = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0;
    ctx->ctx_fl_exclintr = (ctx_flags & PFM_FL_EXCL_INTR) ? 1: 0;
    ctx->ctx_fl_frozen   = 0;

    /* 
     * Keep track of the pmds we want to sample
     * XXX: may be we don't need to save/restore the DEAR/IEAR pmds
     * but we do need the BTB for sure. This is because of a hardware
     * buffer of 1 only for non-BTB pmds.
     */
    ctx->ctx_used_pmds[0] = tmp.pfr_ctx.smpl_regs;
    ctx->ctx_used_pmcs[0] = 1; /* always save/restore PMC[0] */

    sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */


    if (copy_to_user(req, &tmp, sizeof(tmp))) {
        ret = -EFAULT;
        goto buffer_error;
    }

    DBprintk((" context=%p, pid=%d notify_sig %d notify_task=%p\n",(void *)ctx, current->pid, ctx->ctx_notify_sig, ctx->ctx_notify_task));
    DBprintk((" context=%p, pid=%d flags=0x%x inherit=%d noblock=%d system=%d\n",(void *)ctx, current->pid, ctx_flags, ctx->ctx_fl_inherit, ctx->ctx_fl_noblock, ctx->ctx_fl_system));

    /*
     * when no notification is required, we can make this visible at the last moment
     */
    if (pid == 0) current->thread.pfm_context = ctx;

    /*
     * by default, we always include interrupts for system wide
     * DCR.pp is set by default to zero by kernel  in cpu_init()
     */
    if (ctx->ctx_fl_system) {
        if (ctx->ctx_fl_exclintr == 0) {
            unsigned long dcr = ia64_get_dcr();

            ia64_set_dcr(dcr|IA64_DCR_PP);
            /*
             * keep track of the kernel default value
             */
            pfs_info.pfs_dfl_dcr = dcr;

            DBprintk((" dcr.pp is set\n"));
        }
    } 

    return 0;

buffer_error:
    /*
     * the context may already have been made visible (notification paths
     * above): detach it before it is freed so that the task is not left
     * with a dangling pointer.
     */
    if (current->thread.pfm_context == ctx) current->thread.pfm_context = NULL;

    pfm_context_free(ctx);
error:
    /*
     * undo session reservation
     */
    if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
        pfs_info.pfs_sys_session = 0;
    } else {
        pfs_info.pfs_proc_sessions--;
    }
    return ret;
}

/*
 * Reloads the sampling reset value (smpl_rval) into every generic counter
 * whose bit is set in ctx->ctx_ovfl_regs, then clears ctx_ovfl_regs.
 * Bit 0 of the mask is matched with the first generic counter
 * (PMD[PMU_FIRST_COUNTER]), bit 1 with the next one, and so on for
 * pmu_conf.max_counters counters.
 */
static void
pfm_reset_regs(pfm_context_t *ctx)
{
    unsigned long pending = ctx->ctx_ovfl_regs;
    int slot = 0;
    int pmd  = PMU_FIRST_COUNTER;

    DBprintk((" ovfl_regs=0x%lx\n", pending));

    /*
     * now restore reset value on sampling overflowed counters
     */
    while (slot < pmu_conf.max_counters) {
        if (pending & 0x1) {
            DBprintk((" reseting PMD[%d]=%lx\n", pmd, ctx->ctx_pmds[slot].smpl_rval & pmu_conf.perf_ovfl_val));

            /* upper part is ignored on rval */
            ia64_set_pmd(pmd, ctx->ctx_pmds[slot].smpl_rval);

            /*
             * the BTB index must be reset as well (clears pmd16.full)
             * to make sure the same branches are not reported twice.
             * The non-blocking case is handled in update_counters()
             */
            if (pmd == ctx->ctx_btb_counter) {
                DBprintk(("reseting PMD16\n"));
                ia64_set_pmd(16, 0);
            }
        }
        slot++;
        pmd++;
        pending >>= 1;
    }

    /* just in case ! */
    ctx->ctx_ovfl_regs = 0;
}

/*
 * Writes count PMC registers on behalf of task ta.
 *
 * ta     task whose perfmon context (ta->thread.pfm_context) is updated
 * req    user array of count requests; reg_num and reg_value of each
 *        element are used
 * count  number of elements in req
 *
 * For each request the register is checked against the implemented PMCs,
 * EAR/BTB events programmed on a counter are remembered in the context, the
 * register is marked as used and the value is written to the PMC.
 *
 * Returns 0 on success, -EFAULT when a request cannot be copied from user
 * space, -EINVAL when a register is not implemented. Requests processed
 * before a failure have already been applied.
 */
static int
pfm_write_pmcs(struct task_struct *ta, perfmon_req_t *req, int count)
{
    struct thread_struct *th = &ta->thread;
    pfm_context_t *ctx = th->pfm_context;
    perfmon_req_t tmp;
    unsigned long cnum;
    int i;

    /* XXX: ctx locking may be required here */

    for (i = 0; i < count; i++, req++) {

        if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

        cnum = tmp.pfr_reg.reg_num;

        /* XXX needs to check validity of the data maybe */
        if (!PMC_IS_IMPL(cnum)) {
            DBprintk((" invalid pmc[%ld]\n", cnum));
            return -EINVAL;
        }

        if (PMC_IS_COUNTER(cnum)) {

            /*
             * we keep track of EARS/BTB to speed up sampling later
             */
            if (PMC_IS_DEAR(&tmp.pfr_reg.reg_value)) {
                ctx->ctx_dear_counter = cnum;
            } else if (PMC_IS_IEAR(&tmp.pfr_reg.reg_value)) {
                ctx->ctx_iear_counter = cnum;
            } else if (PMC_IS_BTB(&tmp.pfr_reg.reg_value)) {
                ctx->ctx_btb_counter = cnum;
            }
        }
        /* keep track of what we use */
        CTX_USED_PMC(ctx, cnum);
        ia64_set_pmc(cnum, tmp.pfr_reg.reg_value);

        /*
         * ctx_pmds[] only has slots for the generic counters: for any other
         * PMC (PMC0 for instance) "cnum - PMU_FIRST_COUNTER" is not a valid
         * index, so the flags are only read for counters.
         */
        DBprintk((" setting PMC[%ld]=0x%lx flags=0x%x used_pmcs=0%lx\n", cnum, tmp.pfr_reg.reg_value,
              PMC_IS_COUNTER(cnum) ? ctx->ctx_pmds[cnum - PMU_FIRST_COUNTER].flags : 0,
              ctx->ctx_used_pmcs[0]));

    }
    /*
     * we have to set this here even though we haven't necessarily started monitoring
     * because we may be context switched out
     */
    if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID;

    return 0;
}

/*
 * Program a set of PMD (data) registers.
 *
 * ta:    task whose thread/perfmon context records the registers in use
 * req:   user-space array of requests (register number, value, reset values, flags)
 * count: number of entries in req
 *
 * For counting PMDs the software state kept in ctx->ctx_pmds[] (initial value,
 * upper bits of the 64-bit virtual counter, reset values, overflow
 * notification flag) is updated as well as the hardware register.
 *
 * Returns 0 on success, -EFAULT when a request cannot be copied from user
 * space and -EINVAL when a request names an unimplemented PMD. Requests
 * preceding a failing one have already been applied.
 */
static int
pfm_write_pmds(struct task_struct *ta, perfmon_req_t *req, int count)
{
    struct thread_struct *th = &ta->thread;
    pfm_context_t *ctx = th->pfm_context;
    perfmon_req_t tmp;
    unsigned long cnum;
    int i;

    /* XXX: ctx locking may be required here */

    for (i = 0; i < count; i++, req++) {
        int is_counter;
        int k = 0; /* index into ctx_pmds[], only meaningful when is_counter */

        if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

        cnum = tmp.pfr_reg.reg_num;

        if (!PMD_IS_IMPL(cnum)) return -EINVAL;

        is_counter = PMD_IS_COUNTER(cnum) ? 1 : 0;

        /* update virtualized (64bits) counter */
        if (is_counter) {
            k = cnum - PMU_FIRST_COUNTER;

            ctx->ctx_pmds[k].ival = tmp.pfr_reg.reg_value;
            ctx->ctx_pmds[k].val  = tmp.pfr_reg.reg_value & ~pmu_conf.perf_ovfl_val;
            ctx->ctx_pmds[k].smpl_rval = tmp.pfr_reg.reg_smpl_reset;
            ctx->ctx_pmds[k].ovfl_rval = tmp.pfr_reg.reg_ovfl_reset;

            if (tmp.pfr_reg.reg_flags & PFM_REGFL_OVFL_NOTIFY)
                ctx->ctx_pmds[k].flags |= PFM_REGFL_OVFL_NOTIFY;
        }
        /* keep track of what we use */
        CTX_USED_PMD(ctx, cnum);

        /* writes to unimplemented part is ignored, so this is safe */
        ia64_set_pmd(cnum, tmp.pfr_reg.reg_value);

        /* to go away */
        ia64_srlz_d();

        /*
         * ctx_pmds[] only has entries for counting PMDs: report zeroes for the
         * software state of any other register instead of indexing out of bounds.
         */
        DBprintk((" setting PMD[%ld]:  ovfl_notify=%d pmd.val=0x%lx pmd.ovfl_rval=0x%lx pmd.smpl_rval=0x%lx pmd=%lx used_pmds=0%lx\n",
                    cnum,
                    is_counter ? PMD_OVFL_NOTIFY(ctx, k) : 0,
                    is_counter ? ctx->ctx_pmds[k].val : 0UL,
                    is_counter ? ctx->ctx_pmds[k].ovfl_rval : 0UL,
                    is_counter ? ctx->ctx_pmds[k].smpl_rval : 0UL,
                    ia64_get_pmd(cnum) & pmu_conf.perf_ovfl_val,
                    ctx->ctx_used_pmds[0]));
    }
    /*
     * we have to set this here even though we haven't necessarily started monitoring
     * because we may be context switched out
     */
    if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID;

    return 0;
}

/*
 * Read a set of PMD registers and hand their values back to user space.
 *
 * ta:    task to read from
 * req:   user-space array of requests; reg_value of each entry is filled in
 * count: number of entries in req
 *
 * A counting PMD is reported as the sum of the hardware bits (the live
 * register for the current task, the saved th->pmd[] copy otherwise) and the
 * software-maintained value in ctx->ctx_pmds[]. Any other PMD can only be
 * read from the current task.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for an
 * unimplemented register or a non-counting PMD of another task.
 *
 * XXX: we must make sure there is no pending overflow before reading. This
 * matters once monitoring has been stopped (by the user or the kernel); while
 * it is still running no accurate value is guaranteed anyway.
 */
static int
pfm_read_pmds(struct task_struct *ta, perfmon_req_t *req, int count)
{
    struct thread_struct *th = &ta->thread;
    pfm_context_t *ctx = th->pfm_context;
    unsigned long val=0;
    perfmon_req_t tmp;
    int n;

    /* XXX: ctx locking may be required here */

    for (n = 0; n < count; n++, req++) {
        /* only used by the debug trace; all ones means "not sampled" */
        unsigned long saved_hw = ~0, soft_part = ~0;
        unsigned long cnum;

        if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

        cnum = tmp.pfr_reg.reg_num;

        if (!PMD_IS_IMPL(cnum)) return -EINVAL;

        if (!PMD_IS_COUNTER(cnum)) {
            /* for now */
            if (ta != current) return -EINVAL;

            ia64_srlz_d();
            val = ia64_get_pmd(cnum);
        } else {
            if (ta == current) {
                val = ia64_get_pmd(cnum);
            } else {
                saved_hw = th->pmd[cnum];
                val      = saved_hw;
            }
            val &= pmu_conf.perf_ovfl_val;

            /*
             * the low part of .val may hold a residual count (see
             * update_counters), hence an addition rather than an OR.
             */
            soft_part = ctx->ctx_pmds[cnum - PMU_FIRST_COUNTER].val;
            val      += soft_part;
        }
        tmp.pfr_reg.reg_value = val;

        DBprintk((" reading PMD[%ld]=0x%lx reg=0x%lx ctx_val=0x%lx pmc=0x%lx\n", 
                    cnum, val, saved_hw, soft_part, ia64_get_pmc(cnum)));

        if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
    }
    return 0;
}

static int
pfm_do_restart(struct task_struct *task)
{
    struct thread_struct *th = &task->thread;
    pfm_context_t *ctx = th->pfm_context;
    void *sem = &ctx->ctx_restart_sem;

    if (task == current) {
        DBprintk((" restarting self %d frozen=%d \n", current->pid, ctx->ctx_fl_frozen));

        pfm_reset_regs(ctx);

        /*
         * We ignore block/don't block because we never block
         * for a self-monitoring process.
         */
        ctx->ctx_fl_frozen = 0;

        if (CTX_HAS_SMPL(ctx)) {
            ctx->ctx_smpl_buf->psb_hdr->hdr_count = 0;
            ctx->ctx_smpl_buf->psb_index = 0;
        }

        /* pfm_reset_smpl_buffers(ctx,th->pfm_ovfl_regs);*/

        /* simply unfreeze */
        ia64_set_pmc(0, 0);
        ia64_srlz_d();

        return 0;
    }

    /* check if blocking */
    if (CTX_OVFL_NOBLOCK(ctx) == 0) {
        DBprintk((" unblocking %d \n", task->pid));
        up(sem);
        return 0;
    }

    /*
     * in case of non blocking mode, then it's just a matter of
     * of reseting the sampling buffer (if any) index. The PMU
     * is already active.
     */

    /*
     * must reset the header count first
     */
    if (CTX_HAS_SMPL(ctx)) {
        DBprintk((" resetting sampling indexes for %d \n", task->pid));
        ctx->ctx_smpl_buf->psb_hdr->hdr_count = 0;
        ctx->ctx_smpl_buf->psb_index = 0;
    }

    return 0;
}

/*
 * system-wide mode: propagate activation/deactivation to every task.
 *
 * cmd is stored in the psr.pp bit of the pt_regs located just below
 * IA64_STK_OFFSET from the start of each task structure.
 *
 * XXX: does not work for SMP, of course
 */
static void
pfm_process_tasklist(int cmd)
{
    struct task_struct *t;

    for_each_task(t) {
        /* the last pt_regs slot before the top of the task's stack area */
        struct pt_regs *regs = (struct pt_regs *)((unsigned long)t + IA64_STK_OFFSET) - 1;

        ia64_psr(regs)->pp = cmd;
    }
}

/*
 * Execute one perfmon command on behalf of sys_perfmonctl().
 *
 * task:  target task; most commands return -EINVAL unless task == current
 * cmd:   one of the PFM_* commands handled by the switch below
 * flags: only forwarded to pfm_context_create()
 * req:   user-space request array, checked with access_ok() before being
 *        handed to the pfm_write_*()/pfm_read_pmds() helpers
 * count: number of entries in req
 * regs:  saved registers whose psr.up/psr.pp bits are toggled to start or
 *        stop monitoring
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EBUSY, -EFAULT, or
 * whatever the helper returns).
 */
static int
do_perfmonctl (struct task_struct *task, int cmd, int flags, perfmon_req_t *req, int count, struct pt_regs *regs)
{
    perfmon_req_t tmp;
    struct thread_struct *th = &task->thread;
    pfm_context_t *ctx = th->pfm_context;

    /* tmp is zeroed here but not otherwise used in this function */
    memset(&tmp, 0, sizeof(tmp));

    /*
     * Every command below PFM_DEBUG_BASE except context creation needs an
     * existing context. Note the debug message names PFM_WRITE_PMCS whatever
     * the actual command is.
     */
    if (ctx == NULL && cmd != PFM_CREATE_CONTEXT && cmd < PFM_DEBUG_BASE) {
        DBprintk((" PFM_WRITE_PMCS: no context for task %d\n", task->pid));
        return -EINVAL;
    }

    switch (cmd) {
        case PFM_CREATE_CONTEXT:
            /* a context has already been defined */
            if (ctx) return -EBUSY;

            /*
             * cannot directly create a context in another process
             */
            if (task != current) return -EINVAL;

            if (req == NULL || count != 1) return -EINVAL;

            if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT;

            return pfm_context_create(flags, req);

        case PFM_WRITE_PMCS:
            /* we don't quite support this right now */
            if (task != current) return -EINVAL;

            if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT;

            return pfm_write_pmcs(task, req, count);

        case PFM_WRITE_PMDS:
            /* we don't quite support this right now */
            if (task != current) return -EINVAL;

            if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT;

            return pfm_write_pmds(task, req, count);

        case PFM_START:
            /* we don't quite support this right now */
            if (task != current) return -EINVAL;

            /* save the state of a previous, different owner before taking over the PMU */
            if (PMU_OWNER()  && PMU_OWNER() != current && PFM_CAN_DO_LAZY()) pfm_lazy_save_regs(PMU_OWNER());

            SET_PMU_OWNER(current);

            /* will start monitoring right after rfi */
            ia64_psr(regs)->up = 1;
            ia64_psr(regs)->pp = 1;

            /* system-wide context: set psr.pp in every task as well */
            if (ctx->ctx_fl_system) {
                pfm_process_tasklist(1);
                pfs_info.pfs_pp = 1;
            }

            /*
             * mark the state as valid.
             * this will trigger save/restore at context switch
             */
            if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID;

            /* unfreeze the PMU */
            ia64_set_pmc(0, 0);
            ia64_srlz_d();

            break;

        case PFM_ENABLE:
            /* we don't quite support this right now */
            if (task != current) return -EINVAL;

            /* save the state of a previous, different owner before taking over the PMU */
            if (PMU_OWNER()  && PMU_OWNER() != current && PFM_CAN_DO_LAZY()) pfm_lazy_save_regs(PMU_OWNER());

            /* reset all registers to stable quiet state */
            ia64_reset_pmu();

            /* make sure nothing starts */
            ia64_psr(regs)->up = 0;
            ia64_psr(regs)->pp = 0;

            /*
             * do it on the live register as well
             * NOTE(review): the mask names psr.pp twice; since both up and pp
             * are cleared in regs just above, psr.up may have been intended
             * for one of them -- confirm against the architecture manual.
             */
            __asm__ __volatile__ ("rsm psr.pp|psr.pp;;"::: "memory");

            SET_PMU_OWNER(current);

            /*
             * mark the state as valid.
             * this will trigger save/restore at context switch
             */
            if (ctx->ctx_fl_system==0) th->flags |= IA64_THREAD_PM_VALID;

            /* simply unfreeze */
            ia64_set_pmc(0, 0);
            ia64_srlz_d();
            break;

        case PFM_DISABLE:
            /* we don't quite support this right now */
            if (task != current) return -EINVAL;

            /* simply freeze */
            ia64_set_pmc(0, 1);
            ia64_srlz_d();
            /*
             * XXX: cannot really toggle IA64_THREAD_PM_VALID
             * but context is still considered valid, so any
             * read request would return something valid. Same
             * thing when this task terminates (pfm_flush_regs()).
             */
            break;

        case PFM_READ_PMDS:
            /* requests are read and then written back, so both directions are checked */
            if (!access_ok(VERIFY_READ, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT;
            if (!access_ok(VERIFY_WRITE, req, sizeof(struct perfmon_req_t)*count)) return -EFAULT;

            return pfm_read_pmds(task, req, count);

          case PFM_STOP:
            /* we don't quite support this right now */
            if (task != current) return -EINVAL;

            /* simply stop monitors, not PMU */
            ia64_psr(regs)->up = 0;
            ia64_psr(regs)->pp = 0;

            /* system-wide context: clear psr.pp in every task as well */
            if (ctx->ctx_fl_system) {
                pfm_process_tasklist(0);
                pfs_info.pfs_pp = 0;
            }

            break;

          case PFM_RESTART: /* temporary, will most likely end up as a PFM_ENABLE */

            /* this command carries no task != current restriction here */
            if ((th->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system==0) {
                printk(" PFM_RESTART not monitoring\n");
                return -EINVAL;
            }
            /* a blocking context can only be restarted while it is frozen */
            if (CTX_OVFL_NOBLOCK(ctx) == 0 && ctx->ctx_fl_frozen==0) {
                printk("task %d without pmu_frozen set\n", task->pid);
                return -EINVAL;
            }

            return pfm_do_restart(task); /* we only look at first entry */

          case PFM_DESTROY_CONTEXT:
            /* we don't quite support this right now */
            if (task != current) return -EINVAL;

            /* first stop monitors */
            ia64_psr(regs)->up = 0;
            ia64_psr(regs)->pp = 0;

            /* then freeze PMU */
            ia64_set_pmc(0, 1);
            ia64_srlz_d();

            /* don't save/restore on context switch */
            if (ctx->ctx_fl_system ==0) task->thread.flags &= ~IA64_THREAD_PM_VALID;

            SET_PMU_OWNER(NULL);

            /* now free context and related state */
            pfm_context_exit(task);
            break;

          case PFM_DEBUG_ON:
            printk("perfmon debugging on\n");
            pfm_debug = 1;
            break;

          case PFM_DEBUG_OFF:
            printk("perfmon debugging off\n");
            pfm_debug = 0;
            break;

          default:
            DBprintk((" UNknown command 0x%x\n", cmd));
            return -EINVAL;
    }
    return 0;
}

/*
 * Decide whether current may operate on task (logic borrowed from
 * bad_signal()).
 *
 * Access is granted when both tasks share a session, or when the uid or
 * euid of current equals the uid or suid of task.
 *
 * Returns 1 when access must be refused, 0 otherwise.
 *
 * XXX: do something better here
 */
static int
perfmon_bad_permissions(struct task_struct *task)
{
    if (current->session == task->session)
        return 0;

    if (current->euid == task->suid || current->euid == task->uid)
        return 0;

    if (current->uid == task->suid || current->uid == task->uid)
        return 0;

    return 1;
}

/*
 * perfmonctl system call entry point.
 *
 * pid:         task to operate on; anything but current->pid is looked up
 *              with find_task_by_pid() under tasklist_lock
 * cmd, flags,
 * req, count:  passed unchanged to do_perfmonctl()
 * arg6..arg8:  unused
 * stack:       its address is reinterpreted as the struct pt_regs handed to
 *              do_perfmonctl()
 *
 * Returns -ENOSYS when perfmon is disabled, -ESRCH when pid does not exist,
 * -EPERM when perfmon_bad_permissions() refuses access, otherwise the result
 * of do_perfmonctl().
 */
asmlinkage int
sys_perfmonctl (int pid, int cmd, int flags, perfmon_req_t *req, int count, long arg6, long arg7, long arg8, long stack)
{
    struct pt_regs *regs = (struct pt_regs *) &stack;
    struct task_struct *target = current;
    int err;

    /*
     * sanity check: avoids doing bad things when the OS does not have
     * enough storage to save/restore PMC/PMD
     */
    if (PERFMON_IS_DISABLED()) return -ENOSYS;

    /* XXX: pid interface is going away in favor of pfm context */
    if (pid != current->pid) {
        read_lock(&tasklist_lock);

        target = find_task_by_pid(pid);
        if (target == NULL) {
            err = -ESRCH;
            goto out;
        }

        if (perfmon_bad_permissions(target)) {
            err = -EPERM;
            goto out;
        }

        /*
         * XXX: need to do more checking here
         */
        if (target->state != TASK_ZOMBIE && target->state != TASK_STOPPED) {
            DBprintk((" warning process %d not in stable state %ld\n", pid, target->state));
        }
    }
    err = do_perfmonctl(target, cmd, flags, req, count, regs);

out:
    /* the lock is held exactly when target is not (or no longer) current */
    if (target != current) read_unlock(&tasklist_lock);

    return err;
}

/*
 * Put the current task to sleep on its context restart semaphore after an
 * overflow notification, then resume monitoring once it has been released.
 *
 * The pending pfm_must_block request is always cleared first. If the sleep
 * is interrupted nothing is restarted.
 */
#if __GNUC__ >= 3
void asmlinkage
pfm_block_on_overflow(void)
#else
void asmlinkage
pfm_block_on_overflow(u64 arg0, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7)
#endif
{
    pfm_context_t *ctx = current->thread.pfm_context;
    int ret;

    /*
     * whatever notify_pid is, the request is consumed now:
     * we won't notify again
     */
    current->thread.pfm_must_block = 0;

    /* sanity checks */
    if (ctx == NULL) {
        printk("perfmon: process %d has no PFM context\n", current->pid);
        return;
    }
    if (!ctx->ctx_notify_task) {
        printk("perfmon: process %d has no task to notify\n", current->pid);
        return;
    }

    DBprintk((" current=%d task=%d\n", current->pid, ctx->ctx_notify_task->pid));

    /* should not happen: only blocking contexts are supposed to get here */
    if (CTX_OVFL_NOBLOCK(ctx)) {
        printk("perfmon: process %d non-blocking ctx should not be here\n", current->pid);
        return;
    }

    DBprintk((" CPU%d %d before sleep\n", smp_processor_id(), current->pid));

    /*
     * On SMP this may return without sleeping when the restart has
     * already been received by the time down() is called.
     */
    ret = down_interruptible(&ctx->ctx_restart_sem);

    DBprintk((" CPU%d %d after sleep ret=%d\n", smp_processor_id(), current->pid, ret));

    /* an interrupted down() restarts nothing */
    if (ret < 0)
        return;

    /* we reactivate on context switch */
    ctx->ctx_fl_frozen = 0;

    /*
     * the ovfl_sem is cleared by the restart task and this is safe because
     * we always use the local reference
     */
    pfm_reset_regs(ctx);

    /*
     * Rewind the sampling buffer.
     * XXX: not really needed when blocking
     */
    if (CTX_HAS_SMPL(ctx)) {
        ctx->ctx_smpl_buf->psb_hdr->hdr_count = 0;
        ctx->ctx_smpl_buf->psb_index = 0;
    }

    DBprintk((" CPU%d %d unfreeze PMU\n", smp_processor_id(), current->pid));

    ia64_set_pmc(0, 0);
    ia64_srlz_d();

    /* state restored, can go back to work (user mode) */
}

/*
 * main overflow processing routine.
 * it can be called from the interrupt path or explicitly during the context switch code
 *
 * task: task owning the PMU when the overflow happened (may be NULL)
 * pmc0: value of pmc[0]; bit 0 is the freeze bit, bits from PMU_FIRST_COUNTER
 *       up flag the counters which overflowed
 * regs: interrupted registers, or NULL when none are available
 *
 * For every overflowed counter the 64-bit software value is updated. When a
 * sampling buffer is attached one sample is recorded. Depending on the
 * context, counters are then reloaded, or a signal is sent to the
 * notification task and the PMU is kept frozen.
 *
 * Return:
 *    new value of pmc[0]. if 0x0 then unfreeze, else keep frozen
 */
unsigned long
update_counters (struct task_struct *task, u64 pmc0, struct pt_regs *regs)
{
    unsigned long mask, i, cnum;
    struct thread_struct *th;
    pfm_context_t *ctx;
    unsigned long bv = 0;
    int my_cpu = smp_processor_id();
    int ret = 1, buffer_is_full = 0;
    int ovfl_has_long_recovery, can_notify, need_reset_pmd16=0;
    struct siginfo si;

    /*
     * It is never safe to access the task for which the overflow interrupt is destined
     * using the current variable as the interrupt may occur in the middle of a context switch
     * where current does not hold the task that is running yet.
     *
     * For monitoring, however, we do need to get access to the task which caused the overflow
     * to account for overflow on the counters.
     *
     * We accomplish this by maintaining a current owner of the PMU per CPU. During context
     * switch the ownership is changed in a way such that the reflected owner is always the
     * valid one, i.e. the one that caused the interrupt.
     */

    if (task == NULL) {
        DBprintk((" owners[%d]=NULL\n", my_cpu));
        return 0x1;
    }
    th  = &task->thread;
    ctx = th->pfm_context;

    /*
     * The context must be validated first: the next test dereferences it.
     */
    if (!ctx) {
        printk("perfmon: Spurious overflow interrupt: process %d has no PFM context\n", task->pid);
        return 0;
    }

    /*
     * XXX: debug test
     * Don't think this could happen given upfront tests
     */
    if ((th->flags & IA64_THREAD_PM_VALID) == 0 && ctx->ctx_fl_system == 0) {
        printk("perfmon: Spurious overflow interrupt: process %d not using perfmon\n", task->pid);
        return 0x1;
    }

    /*
     * sanity test. Should never happen
     */
    if ((pmc0 & 0x1 )== 0) {
        printk("perfmon: pid %d pmc0=0x%lx assumption error for freeze bit\n", task->pid, pmc0);
        return 0x0;
    }

    mask = pmc0 >> PMU_FIRST_COUNTER;

    /* regs and the PMU owner may be missing, so neither is dereferenced blindly */
    DBprintk(("pmc0=0x%lx pid=%d owner=%d iip=0x%lx, ctx is in %s mode used_pmds=0x%lx used_pmcs=0x%lx\n", 
                pmc0, task->pid, PMU_OWNER() ? PMU_OWNER()->pid : -1, regs ? regs->cr_iip : 0x0, 
                CTX_OVFL_NOBLOCK(ctx) ? "NO-BLOCK" : "BLOCK",
                ctx->ctx_used_pmds[0],
                ctx->ctx_used_pmcs[0]));

    /*
     * XXX: need to record sample only when an EAR/BTB has overflowed
     */
    if (CTX_HAS_SMPL(ctx)) {
        pfm_smpl_buffer_desc_t *psb = ctx->ctx_smpl_buf;
        unsigned long *e, m, idx=0;
        perfmon_smpl_entry_t *h;
        int j;

        idx = ia64_fetch_and_add(1, &psb->psb_index);
        DBprintk((" recording index=%ld entries=%ld\n", idx, psb->psb_entries));

        /*
         * XXX: there is a small chance that we could run out on index before resetting
         * but index is unsigned long, so it will take some time.....
         * We use > instead of == because fetch_and_add() is off by one (see below)
         *
         * This case can happen in non-blocking mode or with multiple processes.
         * For non-blocking, we need to reload and continue.
         */
        if (idx > psb->psb_entries) {
            buffer_is_full = 1;
            goto reload_pmds;
        }

        /* first entry is really entry 0, not 1 caused by fetch_and_add */
        idx--;

        h = (perfmon_smpl_entry_t *)(((char *)psb->psb_addr) + idx*(psb->psb_entry_size));

        h->pid  = task->pid;
        h->cpu  = my_cpu;
        h->rate = 0;
        h->ip   = regs ? regs->cr_iip : 0x0; /* where did the fault happen */
        h->regs = mask; /* which registers overflowed */

        /* guaranteed to monotonically increase on each cpu */
        h->stamp = perfmon_get_stamp();

        /* the PMD values follow the entry header */
        e = (unsigned long *)(h+1);

        /*
         * selectively store PMDs in increasing index number
         */
        for (j=0, m = ctx->ctx_smpl_regs; m; m >>=1, j++) {
            if (m & 0x1) {
                if (PMD_IS_COUNTER(j))
                    *e =  ctx->ctx_pmds[j-PMU_FIRST_COUNTER].val
                        + (ia64_get_pmd(j) & pmu_conf.perf_ovfl_val);
                else {
                    *e = ia64_get_pmd(j); /* slow */
                }
                DBprintk((" e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
                e++;
            }
        }
        /*
         * make the new entry visible to user, needs to be atomic
         */
        ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count);

        DBprintk((" index=%ld entries=%ld hdr_count=%ld\n", idx, psb->psb_entries, psb->psb_hdr->hdr_count));
        /*
         * sampling buffer full ?
         */
        if (idx == (psb->psb_entries-1)) {
            /*
             * will cause notification, cannot be 0
             */
            bv = mask << PMU_FIRST_COUNTER;

            buffer_is_full = 1;

            DBprintk((" sampling buffer full must notify bv=0x%lx\n", bv));

            /*
             * we do not reload here, when context is blocking
             */
            if (!CTX_OVFL_NOBLOCK(ctx)) goto no_reload;

            /*
             * here, we have a full buffer but we are in non-blocking mode
             * so we need to reload overflowed PMDs with sampling reset values
             * and restart right away.
             */
        }
        /* FALL THROUGH */
    }
reload_pmds:

    /*
     * in the case of a non-blocking context, we reload
     * with the ovfl_rval when no user notification is taking place (short recovery)
     * otherwise when the buffer is full (which requires user interaction) then we use
     * smpl_rval which is the long_recovery path (disturbance introduced by user execution).
     *
     * XXX: implies that when buffer is full then there is always notification.
     */
    ovfl_has_long_recovery = CTX_OVFL_NOBLOCK(ctx) && buffer_is_full;

    /*
     * XXX: CTX_HAS_SMPL() should really be something like CTX_HAS_SMPL() and is activated, i.e.,
     * one of the PMC is configured for EAR/BTB.
     *
     * When sampling, we can only notify when the sampling buffer is full.
     */
    can_notify   = CTX_HAS_SMPL(ctx) == 0 && ctx->ctx_notify_task;

    DBprintk((" ovfl_has_long_recovery=%d can_notify=%d\n", ovfl_has_long_recovery, can_notify));

    for (i = 0, cnum = PMU_FIRST_COUNTER; mask ; cnum++, i++, mask >>= 1) {

        if ((mask & 0x1) == 0) continue;

        DBprintk((" PMD[%ld] overflowed pmd=0x%lx pmod.val=0x%lx\n", cnum, ia64_get_pmd(cnum), ctx->ctx_pmds[i].val));

        /*
         * Because we sometimes (EARS/BTB) reset to a specific value, we cannot simply use
         * val to count the number of times we overflowed. Otherwise we would lose the current value
         * in the PMD (which can be >0). So to make sure we don't lose
         * the residual counts we set val to contain full 64bits value of the counter.
         *
         * XXX: is this needed for EARS/BTB ?
         */
        ctx->ctx_pmds[i].val += 1 + pmu_conf.perf_ovfl_val
                      + (ia64_get_pmd(cnum) & pmu_conf.perf_ovfl_val); /* slow */

        DBprintk((" pmod[%ld].val=0x%lx pmd=0x%lx\n", i, ctx->ctx_pmds[i].val, ia64_get_pmd(cnum)&pmu_conf.perf_ovfl_val));

        if (can_notify && PMD_OVFL_NOTIFY(ctx, i)) {
            DBprintk((" CPU%d should notify task %p with signal %d\n", my_cpu, ctx->ctx_notify_task, ctx->ctx_notify_sig));
            /*
             * bv is an unsigned long, so the shift is done on an unsigned long:
             * an int shift is undefined once i reaches the width of int.
             *
             * NOTE(review): this records counter-relative bit i while the
             * buffer-full path above stores pmc0-relative bits
             * (mask << PMU_FIRST_COUNTER); confirm which layout the
             * consumers of ctx_ovfl_regs expect.
             */
            bv |= 1UL << i;
        } else {
            DBprintk((" CPU%d PMD[%ld] overflow, no notification\n", my_cpu, cnum));
            /*
             * In case no notification is requested, we reload the reset value right away
             * otherwise we wait until the notify_pid process has been called and
             * has finished processing data. Check out pfm_overflow_notify()
             */

            /* writes to upper part are ignored, so this is safe */
            if (ovfl_has_long_recovery) {
                DBprintk((" CPU%d PMD[%ld] reload with smpl_val=%lx\n", my_cpu, cnum,ctx->ctx_pmds[i].smpl_rval));
                ia64_set_pmd(cnum, ctx->ctx_pmds[i].smpl_rval);
            } else {
                /* trace the value which is actually loaded */
                DBprintk((" CPU%d PMD[%ld] reload with ovfl_val=%lx\n", my_cpu, cnum,ctx->ctx_pmds[i].ovfl_rval));
                ia64_set_pmd(cnum, ctx->ctx_pmds[i].ovfl_rval);
            }
        }
        if (cnum == ctx->ctx_btb_counter) need_reset_pmd16=1;
    }
    /*
     * In case of BTB overflow we need to reset the BTB index.
     */
    if (need_reset_pmd16) {
        DBprintk(("reset PMD16\n"));
        ia64_set_pmd(16, 0);
    }

no_reload:

    /*
     * some counters overflowed, but they did not require
     * user notification, so after having reloaded them above
     * we simply restart
     */
    if (!bv) return 0x0;

    ctx->ctx_ovfl_regs  = bv; /* keep track of what to reset when unblocking */
    /*
     * Now we know that:
     *     - we have some counters which overflowed (contained in bv)
     *     - someone has asked to be notified on overflow.
     */

    /*
     * If the notification task is still present, then notify_task is non
     * null. It is cleared by that task if it ever exits before we do.
     */

    if (ctx->ctx_notify_task) {

        si.si_errno    = 0;
        si.si_addr     = NULL;
        si.si_pid      = task->pid; /* who is sending */

        si.si_signo    = ctx->ctx_notify_sig; /* is SIGPROF */
        si.si_code     = PROF_OVFL; /* goes to user */
        si.si_pfm_ovfl = bv;

        /*
         * when the target of the signal is not ourself, we have to be more
         * careful. The notify_task may be cleared by the target task itself
         * in release_thread(). We must ensure mutual exclusion here such that
         * the signal is delivered (even to a dying task) safely.
         */

        if (ctx->ctx_notify_task != current) {
            /*
             * grab the notification lock for this task
             */
            spin_lock(&ctx->ctx_notify_lock);

            /*
             * now notify_task cannot be modified until we're done
             * if NULL, then it got modified while we were in the handler
             */
            if (ctx->ctx_notify_task == NULL) {
                spin_unlock(&ctx->ctx_notify_lock);
                goto lost_notify;
            }
            /*
             * required by send_sig_info() to make sure the target
             * task does not disappear on us.
             */
            read_lock(&tasklist_lock);
        }
        /*
         * in this case, we don't stop the task, we let it go on. It will
         * necessarily go to the signal handler (if any) when it goes back to
         * user mode.
         */
        DBprintk((" %d sending %d notification to %d\n", task->pid, si.si_signo, ctx->ctx_notify_task->pid));

        /*
         * this call is safe in an interrupt handler, so does read_lock() on tasklist_lock
         */
        ret = send_sig_info(ctx->ctx_notify_sig, &si, ctx->ctx_notify_task);
        if (ret != 0) printk(" send_sig_info(process %d, SIGPROF)=%d\n",  ctx->ctx_notify_task->pid, ret);
        /*
         * now undo the protections in order
         */
        if (ctx->ctx_notify_task != current) {
            read_unlock(&tasklist_lock);
            spin_unlock(&ctx->ctx_notify_lock);
        }

        /*
         * if we block set the pfm_must_block bit
         * when in block mode, we can effectively block only when the notified
         * task is not self, otherwise we would deadlock.
         * in this configuration, the notification is sent, the task will not
         * block on the way back to user mode, but the PMU will be kept frozen
         * until PFM_RESTART.
         * Note that here there is still a race condition with notify_task
         * possibly being nullified behind our back, but this is fine because
         * it can only be changed to NULL which by construction, can only be
         * done when notify_task != current. So if it was already different
         * before, changing it to NULL will still maintain this invariant.
         * Of course, when it is equal to current it cannot change at this point.
         */
        if (!CTX_OVFL_NOBLOCK(ctx) && ctx->ctx_notify_task != current) {
                th->pfm_must_block = 1; /* will cause blocking */
        }
    } else {
lost_notify:
        DBprintk((" notification task has disappeared !\n"));
        /*
         * for a non-blocking context, we make sure we do not fall into the pfm_overflow_notify()
         * trap. Also in the case of a blocking context with lost notify process, then we do not
         * want to block either (even though it is interruptible). In this case, the PMU will be kept
         * frozen and the process will run to completion without monitoring enabled.
         *
         * Of course, we cannot lose the notify process when self-monitoring.
         */
        th->pfm_must_block = 0;

    }
    /*
     * if we block, we keep the PMU frozen. If non-blocking we restart.
     * in the case of non-blocking where the notify process is lost, we also
     * restart.
     */
    if (!CTX_OVFL_NOBLOCK(ctx))
        ctx->ctx_fl_frozen  = 1;
    else
        ctx->ctx_fl_frozen = 0;

    DBprintk((" reload pmc0=0x%x must_block=%ld\n",
                ctx->ctx_fl_frozen ? 0x1 : 0x0, th->pfm_must_block));

    return ctx->ctx_fl_frozen ? 0x1 : 0x0;
}

static void
perfmon_interrupt (int irq, void *arg, struct pt_regs *regs)
{
    u64 pmc0;
    struct task_struct *ta;

    pmc0 = ia64_get_pmc(0); /* slow */

    /*
     * if we have some pending bits set
     * assumes : if any PM[0].bit[63-1] is set, then PMC[0].fr = 1
     */
    if ((pmc0 & ~0x1) && (ta=PMU_OWNER())) {

        /* assumes, PMC[0].fr = 1 at this point */
        pmc0 = update_counters(ta, pmc0, regs);

        /*
         * if pmu_frozen = 0
         *    pmc0 = 0 and we resume monitoring right away
         * else
         *    pmc0 = 0x1 frozen but all pending bits are cleared
         */
        ia64_set_pmc(0, pmc0);
        ia64_srlz_d();
    } else {
        printk("perfmon: Spurious PMU overflow interrupt: pmc0=0x%lx owner=%p\n", pmc0, (void *)PMU_OWNER());
    }
}

/* for debug only */
static int
perfmon_proc_info(char *page)
{
    char *p = page;
    u64 pmc0 = ia64_get_pmc(0);
    int i;

    p += sprintf(p, "CPU%d.pmc[0]=%lx\nPerfmon debug: %s\n", smp_processor_id(), pmc0, pfm_debug ? "On" : "Off");
    p += sprintf(p, "proc_sessions=%lu sys_sessions=%lu\n", 
            pfs_info.pfs_proc_sessions, 
            pfs_info.pfs_sys_session);

    for(i=0; i < NR_CPUS; i++) {
        if (cpu_is_online(i)) {
            p += sprintf(p, "CPU%d.pmu_owner: %-6d\n",
                    i, 
                    pmu_owners[i].owner ? pmu_owners[i].owner->pid: -1);
        }
    }
    return p - page;
}

/*
 * for debug only
 *
 * Read callback registered with create_proc_read_entry(): regenerates the
 * whole text with perfmon_proc_info() and returns the window [off, off+count).
 *
 * Sets *eof when the window reaches the end of the text, points *start at
 * the first byte of the window and returns its length (never negative).
 */
static int
perfmon_read_entry(char *page, char **start, off_t off, int count, int *eof, void *data)
{
    int total = perfmon_proc_info(page);
    int avail;

    if (total <= off+count) *eof = 1;

    *start = page + off;

    /* what is left past the offset, clamped to [0, count] */
    avail = total - off;
    if (avail > count) avail = count;
    if (avail < 0) avail = 0;

    return avail;
}

/* interrupt descriptor registered for the perfmon vector by perfmon_init() */
static struct irqaction perfmon_irqaction = {
    .handler = perfmon_interrupt,
    .flags   = SA_INTERRUPT,
    .name    = "perfmon"
};

/*
 * Boot-time initialization of perfmon.
 *
 * Registers the overflow interrupt, programs the PMU vector, queries PAL for
 * the PMU description and fills pmu_conf. Perfmon is left disabled when the
 * PAL call fails or when the kernel lacks storage for the PMC/PMD registers;
 * otherwise it is enabled and the "perfmon" proc entry is created.
 */
void __init
perfmon_init (void)
{
    pal_perf_mon_info_u_t pm_info;
    s64 status;

    register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);

    ia64_set_pmv(IA64_PERFMON_VECTOR);
    ia64_srlz_d();

    /* stays disabled until every check below has passed */
    pmu_conf.pfm_is_disabled = 1;

    printk("perfmon: version %s (sampling format v%d)\n", PFM_VERSION, PFM_SMPL_HDR_VERSION);
    printk("perfmon: Interrupt vectored to %u\n", IA64_PERFMON_VECTOR);

    status = ia64_pal_perf_mon_info(pmu_conf.impl_regs, &pm_info);
    if (status != 0) {
        printk("perfmon: PAL call failed (%ld)\n", status);
        return;
    }

    /* a counter of `width` bits overflows past (1 << width) - 1 */
    pmu_conf.perf_ovfl_val = (1L << pm_info.pal_perf_mon_info_s.width) - 1;
    pmu_conf.max_counters  = pm_info.pal_perf_mon_info_s.generic;
    pmu_conf.num_pmcs      = find_num_pm_regs(pmu_conf.impl_regs);
    pmu_conf.num_pmds      = find_num_pm_regs(&pmu_conf.impl_regs[4]);

    printk("perfmon: %d bits counters (max value 0x%lx)\n", pm_info.pal_perf_mon_info_s.width, pmu_conf.perf_ovfl_val);
    printk("perfmon: %ld PMC/PMD pairs, %ld PMCs, %ld PMDs\n", pmu_conf.max_counters, pmu_conf.num_pmcs, pmu_conf.num_pmds);

    /* sanity check: both register files must fit in the kernel save areas */
    if (!(pmu_conf.num_pmds < IA64_NUM_PMD_REGS && pmu_conf.num_pmcs < IA64_NUM_PMC_REGS)) {
        printk(KERN_ERR "perfmon: ERROR not enough PMC/PMD storage in kernel, perfmon is DISABLED\n");
        return; /* no need to continue anyway */
    }

    /* we are all set */
    pmu_conf.pfm_is_disabled = 0;

    /*
     * for now here for debug purposes
     */
    perfmon_dir = create_proc_read_entry ("perfmon", 0, NULL, perfmon_read_entry, NULL);
}

/*
 * Per-CPU part of the perfmon setup: program the PMV with the perfmon
 * interrupt vector and serialize, exactly as perfmon_init() does for the
 * CPU it runs on.
 * NOTE(review): presumably invoked once on each CPU as it comes up --
 * confirm against the caller.
 */
void
perfmon_init_percpu (void)
{
    ia64_set_pmv(IA64_PERFMON_VECTOR);
    ia64_srlz_d();
}

/*
 * Save the PMU state on behalf of task 'ta'.
 *
 * Sequence, in order:
 *   1. save the PSR, then stop monitoring by clearing psr.up and psr.pp
 *   2. give up PMU ownership so that an in-flight overflow interrupt is
 *      treated as spurious by the handler
 *   3. read PMC[0] (overflow status), then freeze the PMU
 *   4. process any pending overflow bits by hand through update_counters()
 *   5. restore the PSR
 *   6. if PFM_CAN_DO_LAZY() holds, hand ownership back to the previous owner
 *      and return without copying registers; otherwise copy the PMDs and
 *      PMCs flagged in the context's used-register masks into the thread
 *      save area (PMC[0] is excluded).
 *
 * On the non-lazy path the PMU is left without an owner.
 */
void
pfm_save_regs (struct task_struct *ta)
{
    struct task_struct *owner;
    pfm_context_t *ctx;
    struct thread_struct *t;
    u64 pmc0, psr;
    unsigned long mask;
    int i;

    t   = &ta->thread;
    ctx = ta->thread.pfm_context;

    /*
     * We must make sure that we don't lose any potential overflow
     * interrupt while saving PMU context. In this code, external
     * interrupts are always enabled.
     */

    /*
     * save current PSR: needed because we modify it
     */
    __asm__ __volatile__ ("mov %0=psr;;": "=r"(psr) :: "memory");

    /*
     * stop monitoring:
     * This is the only way to stop monitoring without destroying overflow
     * information in PMC[0].
     * This is the last instruction which can cause overflow when monitoring
     * in kernel.
     * By now, we could still have an overflow interrupt in-flight.
     */
    __asm__ __volatile__ ("rsm psr.up|psr.pp;;"::: "memory");

    /*
     * Mark the PMU as not owned
     * This will cause the interrupt handler to do nothing in case an overflow
     * interrupt was in-flight
     * This also guarantees that pmc0 will contain the final state
     * It virtually gives us full control over overflow processing from that point
     * on.
     * It must be an atomic operation.
     */
    owner = PMU_OWNER();
    SET_PMU_OWNER(NULL);

    /*
     * read current overflow status:
     *
     * we are guaranteed to read the final stable state
     */
    ia64_srlz_d();
    pmc0 = ia64_get_pmc(0); /* slow */

    /*
     * freeze PMU:
     *
     * This destroys the overflow information. This is required to make sure
     * next process does not start with monitoring on if not requested
     */
    ia64_set_pmc(0, 1);

    /*
     * Check for overflow bits and proceed manually if needed
     * (any bit other than bit 0 of the saved pmc0 counts as pending work).
     *
     * It is safe to call the interrupt handler now because it does
     * not try to block the task right away. Instead it will set a
     * flag and let the task proceed. The blocking will only occur
     * next time the task exits from the kernel.
     */
    if (pmc0 & ~0x1) {
        update_counters(owner, pmc0, NULL);
        /* we will save the updated version of pmc0 */
    }
    /*
     * restore PSR for context switch to save
     */
    __asm__ __volatile__ ("mov psr.l=%0;; srlz.i;;"::"r"(psr): "memory");

    /*
     * we do not save registers if we can do lazy:
     * ownership goes back to the task that held it on entry
     */
    if (PFM_CAN_DO_LAZY()) {
        SET_PMU_OWNER(owner);
        return;
    }

    /*
     * XXX needs further optimization.
     * Also must take holes into account
     *
     * Walk the used-PMD bitmask from bit 0 upwards and copy each flagged
     * register into the thread save area.
     */
    mask = ctx->ctx_used_pmds[0];
    for (i=0; mask; i++, mask>>=1) {
        if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
    }

    /* skip PMC[0], we handle it separately */
    mask = ctx->ctx_used_pmcs[0]>>1;
    for (i=1; mask; i++, mask>>=1) {
        if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
    }
    /*
     * Throughout this code we could have gotten an overflow interrupt. It is transformed
     * into a spurious interrupt as soon as we give up pmu ownership.
     */
}

static void
pfm_lazy_save_regs (struct task_struct *ta)
{
    pfm_context_t *ctx;
    struct thread_struct *t;
    unsigned long mask;
    int i;

    DBprintk(("  on [%d] by [%d]\n", ta->pid, current->pid));

    t   = &ta->thread;
    ctx = ta->thread.pfm_context;
    /*
     * XXX needs further optimization.
     * Also must take holes into account
     */
    mask = ctx->ctx_used_pmds[0];
    for (i=0; mask; i++, mask>>=1) {
        if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
    }
    
    /* skip PMC[0], we handle it separately */
    mask = ctx->ctx_used_pmcs[0]>>1;
    for (i=1; mask; i++, mask>>=1) {
        if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
    }
    SET_PMU_OWNER(NULL);
}

void
pfm_load_regs (struct task_struct *ta)
{
    struct thread_struct *t = &ta->thread;
    pfm_context_t *ctx = ta->thread.pfm_context;
    struct task_struct *owner;
    unsigned long mask;
    int i;

    owner = PMU_OWNER();
    if (owner == ta) goto skip_restore;
    if (owner) pfm_lazy_save_regs(owner);

    SET_PMU_OWNER(ta);

    mask = ctx->ctx_used_pmds[0];
    for (i=0; mask; i++, mask>>=1) {
        if (mask & 0x1) ia64_set_pmd(i, t->pmd[i]);
    }

    /* skip PMC[0] to avoid side effects */
    mask = ctx->ctx_used_pmcs[0]>>1;
    for (i=1; mask; i++, mask>>=1) {
        if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
    }
skip_restore:
    /*
     * unfreeze only when possible
     */
    if (ctx->ctx_fl_frozen == 0) {
        ia64_set_pmc(0, 0);
        ia64_srlz_d();
        /* place where we potentially (kernel level) start monitoring again */
    }
}


/*
 * This function is called when a thread exits (from exit_thread()).
 * This is a simplified pfm_save_regs() that simply flushes the current
 * register state into the save area taking into account any pending
 * overflow. This time no notification is sent because the task is dying
 * anyway. The inline processing of overflows avoids losing some counts.
 * The PMU is frozen on exit from this call and is to never be reenabled
 * again for this task.
 *
 * ta: the exiting task; both 'ta' and its perfmon context must be non-NULL,
 *     otherwise the function panics.
 *
 * NOTE(review): only psr.up is cleared here, whereas pfm_save_regs() clears
 * psr.up and psr.pp -- confirm this difference is intended.
 */
void
pfm_flush_regs (struct task_struct *ta)
{
    pfm_context_t *ctx;
    u64 pmc0, psr, mask;
    int i,j;

    if (ta == NULL) {
        panic(__FUNCTION__" task is NULL\n");
    }
    ctx = ta->thread.pfm_context;
    if (ctx == NULL) {
        panic(__FUNCTION__" no PFM ctx is NULL\n");
    }
    /*
     * We must make sure that we don't lose any potential overflow
     * interrupt while saving PMU context. In this code, external
     * interrupts are always enabled.
     */

    /*
     * save current PSR: needed because we modify it
     */
    __asm__ __volatile__ ("mov %0=psr;;": "=r"(psr) :: "memory");

    /*
     * stop monitoring:
     * This is the only way to stop monitoring without destroying overflow
     * information in PMC[0].
     * This is the last instruction which can cause overflow when monitoring
     * in kernel.
     * By now, we could still have an overflow interrupt in-flight.
     */
    __asm__ __volatile__ ("rsm psr.up;;"::: "memory");

    /*
     * Mark the PMU as not owned
     * This will cause the interrupt handler to do nothing in case an overflow
     * interrupt was in-flight
     * This also guarantees that pmc0 will contain the final state
     * It virtually gives us full control on overflow processing from that point
     * on.
     * It must be an atomic operation.
     */
    SET_PMU_OWNER(NULL);

    /*
     * read current overflow status:
     *
     * we are guaranteed to read the final stable state
     */
    ia64_srlz_d();
    pmc0 = ia64_get_pmc(0); /* slow */

    /*
     * freeze PMU:
     *
     * This destroys the overflow information. This is required to make sure
     * next process does not start with monitoring on if not requested
     */
    ia64_set_pmc(0, 1);
    ia64_srlz_d();

    /*
     * restore PSR for context switch to save
     */
    __asm__ __volatile__ ("mov psr.l=%0;;srlz.i;"::"r"(psr): "memory");

    /*
     * This loop flushes the PMD into the PFM context.
     * It also processes overflow inline.
     *
     * i indexes the software counters in ctx_pmds[], j is the matching
     * hardware PMD number (starting at PMU_FIRST_COUNTER); 'mask' holds the
     * saved pmc0 shifted so that bit 0 corresponds to the first counter.
     *
     * IMPORTANT: No notification is sent at this point as the process is dying.
     * The implicit notification will come from a SIGCHILD or a return from a
     * waitpid().
     *
     * XXX: must take holes into account
     */
    mask = pmc0 >> PMU_FIRST_COUNTER;
    for (i=0,j=PMU_FIRST_COUNTER; i< pmu_conf.max_counters; i++,j++) {

        /* collect latest results */
        ctx->ctx_pmds[i].val += ia64_get_pmd(j) & pmu_conf.perf_ovfl_val;

        /*
         * now everything is in ctx_pmds[] and we need
         * to clear the saved context from save_regs() such that
         * pfm_read_pmds() gets the correct value
         */
        ta->thread.pmd[j] = 0;

        /* take care of overflow inline: account for one full counter wrap */
        if (mask & 0x1) {
            ctx->ctx_pmds[i].val += 1 + pmu_conf.perf_ovfl_val;
            DBprintk((" PMD[%d] overflowed pmd=0x%lx pmds.val=0x%lx\n",
            j, ia64_get_pmd(j), ctx->ctx_pmds[i].val));
        }
        mask >>=1;
    }
}

/*
 * Put the PMU of the CPU executing this code into a known reset state:
 * frozen, with the PMCs reprogrammed to the values below and every
 * implemented PMD set to zero.
 *
 * The register numbers 8-13 and their reset values are hard-coded, hence:
 * XXX: this routine is not very portable for PMCs
 * XXX: make this routine able to work with non current context
 */
static void
ia64_reset_pmu(void)
{
    int i;

    /* PMU is frozen, no pending overflow bits */
    ia64_set_pmc(0,1);

    /*
     * extra overflow bits + counter configs cleared:
     * covers PMC[1] up to the last generic counter config register
     */
    for(i=1; i< PMU_FIRST_COUNTER + pmu_conf.max_counters ; i++) {
        ia64_set_pmc(i,0);
    }

    /* opcode matcher set to all 1s */
    ia64_set_pmc(8,~0);
    ia64_set_pmc(9,~0);

    /* I-EAR config cleared, plm=0 */
    ia64_set_pmc(10,0);

    /* D-EAR config cleared, PMC[11].pt must be 1 */
    ia64_set_pmc(11,1 << 28);

    /* BTB config. plm=0 */
    ia64_set_pmc(12,0);

    /* Instruction address range, PMC[13].ta must be 1 */
    ia64_set_pmc(13,1);

    /* clears all PMD registers (unimplemented ones are skipped) */
    for(i=0;i< pmu_conf.num_pmds; i++) {
        if (PMD_IS_IMPL(i))  ia64_set_pmd(i,0);
    }
    /* make the register writes above take effect before returning */
    ia64_srlz_d();
}

/*
 * task is the newly created task
 */
int
pfm_inherit(struct task_struct *task, struct pt_regs *regs)
{
    pfm_context_t *ctx = current->thread.pfm_context;
    pfm_context_t *nctx;
    struct thread_struct *th = &task->thread;
    int i, cnum;

    /*
     * bypass completely for system wide
     */
    if (pfs_info.pfs_sys_session) {
        DBprintk((" enabling psr.pp for %d\n", task->pid));
        ia64_psr(regs)->pp = pfs_info.pfs_pp;
        return 0;
    }

    /*
     * takes care of easiest case first
     */
    if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
        DBprintk((" removing PFM context for %d\n", task->pid));
        task->thread.pfm_context     = NULL;
        task->thread.pfm_must_block  = 0;
        atomic_set(&task->thread.pfm_notifiers_check, 0);
        /* copy_thread() clears IA64_THREAD_PM_VALID */
        return 0;
    }
    nctx = pfm_context_alloc();
    if (nctx == NULL) return -ENOMEM;

    /* copy content */
    *nctx = *ctx;

    if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
        nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
        atomic_set(&task->thread.pfm_notifiers_check, 0);
        DBprintk((" downgrading to INHERIT_NONE for %d\n", task->pid));
        pfs_info.pfs_proc_sessions++;
    }

    /* initialize counters in new context */
    for(i=0, cnum= PMU_FIRST_COUNTER; i < pmu_conf.max_counters; cnum++, i++) {
        nctx->ctx_pmds[i].val = nctx->ctx_pmds[i].ival & ~pmu_conf.perf_ovfl_val;
        th->pmd[cnum]          = nctx->ctx_pmds[i].ival & pmu_conf.perf_ovfl_val;

    }
    /* clear BTB index register */
    th->pmd[16] = 0;

    /* if sampling then increment number of users of buffer */
    if (nctx->ctx_smpl_buf) {
        atomic_inc(&nctx->ctx_smpl_buf->psb_refcnt);
    }

    nctx->ctx_fl_frozen = 0;
    nctx->ctx_ovfl_regs = 0;
    sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */

    /* clear pending notification */
    th->pfm_must_block = 0;

    /* link with new task */
    th->pfm_context     = nctx;

    DBprintk((" nctx=%p for process %d\n", (void *)nctx, task->pid));

    /*
     * the copy_thread routine automatically clears
     * IA64_THREAD_PM_VALID, so we need to reenable it, if it was used by the caller
     */
    if (current->thread.flags & IA64_THREAD_PM_VALID) {
        DBprintk(("  setting PM_VALID for %d\n", task->pid));
        th->flags |= IA64_THREAD_PM_VALID;
    }

    return 0;
}

/* 
 * called from release_thread(), at this point this task is not in the 
 * tasklist anymore
 */
void
pfm_context_exit(struct task_struct *task)
{
    pfm_context_t *ctx = task->thread.pfm_context;

    if (!ctx) {
        DBprintk((" invalid context for %d\n", task->pid));
        return;
    }

    /* check is we have a sampling buffer attached */
    if (ctx->ctx_smpl_buf) {
        pfm_smpl_buffer_desc_t *psb = ctx->ctx_smpl_buf;

        /* if only user left, then remove */
        DBprintk((" [%d] [%d] psb->refcnt=%d\n", current->pid, task->pid, psb->psb_refcnt.counter));

        if (atomic_dec_and_test(&psb->psb_refcnt) ) {
            rvfree(psb->psb_hdr, psb->psb_size);
            vfree(psb);
            DBprintk((" [%d] cleaning [%d] sampling buffer\n", current->pid, task->pid ));
        }
    }
    DBprintk((" [%d] cleaning [%d] pfm_context @%p\n", current->pid, task->pid, (void *)ctx));

    /*
     * To avoid getting the notified task scan the entire process list
     * when it exits because it would have pfm_notifiers_check set, we 
     * decrease it by 1 to inform the task, that one less task is going
     * to send it notification. each new notifer increases this field by
     * 1 in pfm_context_create(). Of course, there is race condition between
     * decreasing the value and the notified task exiting. The danger comes
     * from the fact that we have a direct pointer to its task structure
     * thereby bypassing the tasklist. We must make sure that if we have 
     * notify_task!= NULL, the target task is still somewhat present. It may
     * already be detached from the tasklist but that's okay. Note that it is
     * okay if we 'miss the deadline' and the task scans the list for nothing,
     * it will affect performance but not correctness. The correctness is ensured
     * by using the notify_lock whic prevents the notify_task from changing on us.
     * Once holdhing this lock, if we see notify_task!= NULL, then it will stay like
     * that until we release the lock. If it is NULL already then we came too late.
     */
    spin_lock(&ctx->ctx_notify_lock);

    if (ctx->ctx_notify_task) {
        DBprintk((" [%d] [%d] atomic_sub on [%d] notifiers=%u\n", current->pid, task->pid,
                    ctx->ctx_notify_task->pid, 
                    atomic_read(&ctx->ctx_notify_task->thread.pfm_notifiers_check)));

        atomic_sub(1, &ctx->ctx_notify_task->thread.pfm_notifiers_check);
    }

    spin_unlock(&ctx->ctx_notify_lock);

    if (ctx->ctx_fl_system) {
        /*
         * if included interrupts (true by default), then reset
         * to get default value
         */
        if (ctx->ctx_fl_exclintr == 0) {
            /*
             * reload kernel default DCR value
             */
            ia64_set_dcr(pfs_info.pfs_dfl_dcr);
            DBprintk((" restored dcr to 0x%lx\n", pfs_info.pfs_dfl_dcr));
        }
        /* 
         * free system wide session slot
         */
        pfs_info.pfs_sys_session = 0;
    } else {
        pfs_info.pfs_proc_sessions--;
    }

    pfm_context_free(ctx);
    /* 
     *  clean pfm state in thread structure,
     */
    task->thread.pfm_context    = NULL;
    task->thread.pfm_must_block = 0;
    /* pfm_notifiers is cleaned in pfm_cleanup_notifiers() */

}

/*
 * Detach 'task' from every perfmon context that names it as the task to
 * notify: walk the tasklist under the tasklist read lock and reset
 * ctx_notify_task to NULL in each matching context.
 */
void
pfm_cleanup_notifiers(struct task_struct *task)
{
    pfm_context_t *pctx;
    struct task_struct *t;

    DBprintk((" [%d] called\n", task->pid));

    read_lock(&tasklist_lock);

    for_each_task(t) {
        /*
         * Testing in two steps is safe: thread.ctx is only cleaned up in
         * release_thread(), by which time the task has been detached from
         * the tasklist. Detaching takes write_lock() on tasklist_lock and
         * therefore cannot run concurrently with this loop. A task found
         * here with a perfmon ctx keeps it for the whole duration of the
         * loop.
         */
        pctx = t->thread.pfm_context;

        DBprintk((" [%d] scanning task [%d] ctx=%p\n", task->pid, t->pid, pctx));

        /* not monitored, or notifying somebody else */
        if (!pctx || pctx->ctx_notify_task != task) continue;

        DBprintk((" trying for notifier %d in %d\n", task->pid, t->pid));

        /*
         * The spinlock closes a race with the send_sig_info() call:
         * either send_sig_info() completes on a valid task, or
         * notify_task is cleared before send_sig_info() can pick up a
         * stale value. When this function runs, 'task' is already off the
         * tasklist, but notifiers still hold a direct pointer to it.
         * Signalling a task in that stage is harmless (no effect), which
         * beats signalling a destroyed task or, worse, a new task reusing
         * the same task_struct address.
         */
        spin_lock(&pctx->ctx_notify_lock);
        pctx->ctx_notify_task = NULL;
        spin_unlock(&pctx->ctx_notify_lock);

        DBprintk((" done for notifier %d in %d\n", task->pid, t->pid));
    }

    read_unlock(&tasklist_lock);
}

#else /* !CONFIG_PERFMON */

/*
 * Stub used when the kernel is built without CONFIG_PERFMON: the perfmonctl
 * system call exists but every request fails with -ENOSYS.
 */
asmlinkage int
sys_perfmonctl (int pid, int cmd, int flags, perfmon_req_t *req, int count,
        long arg6, long arg7, long arg8, long stack)
{
    /* all arguments are ignored */
    return -ENOSYS;
}

#endif /* !CONFIG_PERFMON */
:: Command execute ::
Enter:	Select: