Diffstat (limited to 'drivers/xen')
-rw-r--r--  drivers/xen/Kconfig                             |   8
-rw-r--r--  drivers/xen/balloon.c                           |   1
-rw-r--r--  drivers/xen/events/events_base.c                | 111
-rw-r--r--  drivers/xen/events/events_fifo.c                |  26
-rw-r--r--  drivers/xen/evtchn.c                            |   2
-rw-r--r--  drivers/xen/grant-table.c                       |  10
-rw-r--r--  drivers/xen/platform-pci.c                      |   2
-rw-r--r--  drivers/xen/privcmd.c                           | 407
-rw-r--r--  drivers/xen/xen-pciback/conf_space.c            |  19
-rw-r--r--  drivers/xen/xen-pciback/conf_space_capability.c |   8
-rw-r--r--  drivers/xen/xen-pciback/conf_space_header.c     |  21
-rw-r--r--  drivers/xen/xenbus/xenbus_dev_frontend.c        |   4
-rw-r--r--  drivers/xen/xenbus/xenbus_probe.c               |   2
13 files changed, 502 insertions, 119 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index d43153fec18e..d5989871dd5d 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -269,12 +269,12 @@ config XEN_PRIVCMD
disaggregated Xen setups this driver might be needed for other
domains, too.
-config XEN_PRIVCMD_IRQFD
- bool "Xen irqfd support"
+config XEN_PRIVCMD_EVENTFD
+ bool "Xen Ioeventfd and irqfd support"
depends on XEN_PRIVCMD && XEN_VIRTIO && EVENTFD
help
- Using the irqfd mechanism a virtio backend running in a daemon can
- speed up interrupt injection into a guest.
+ Using the ioeventfd / irqfd mechanism a virtio backend running in a
+ daemon can speed up interrupt delivery from / to a guest.
config XEN_ACPI_PROCESSOR
tristate "Xen ACPI processor"
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 586a1673459e..976c6cdf9ee6 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -94,7 +94,6 @@ static struct ctl_table balloon_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- { }
};
#else
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 3bdd5b59661d..6de6b084ea60 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <linux/irqnr.h>
#include <linux/pci.h>
+#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/cpuhotplug.h>
#include <linux/atomic.h>
@@ -96,6 +97,7 @@ enum xen_irq_type {
struct irq_info {
struct list_head list;
struct list_head eoi_list;
+ struct rcu_work rwork;
short refcnt;
u8 spurious_cnt;
u8 is_accounted;
@@ -147,22 +149,12 @@ const struct evtchn_ops *evtchn_ops;
static DEFINE_MUTEX(irq_mapping_update_lock);
/*
- * Lock protecting event handling loop against removing event channels.
- * Adding of event channels is no issue as the associated IRQ becomes active
- * only after everything is setup (before request_[threaded_]irq() the handler
- * can't be entered for an event, as the event channel will be unmasked only
- * then).
- */
-static DEFINE_RWLOCK(evtchn_rwlock);
-
-/*
* Lock hierarchy:
*
* irq_mapping_update_lock
- * evtchn_rwlock
- * IRQ-desc lock
- * percpu eoi_list_lock
- * irq_info->lock
+ * IRQ-desc lock
+ * percpu eoi_list_lock
+ * irq_info->lock
*/
static LIST_HEAD(xen_irq_list_head);
@@ -306,6 +298,22 @@ static void channels_on_cpu_inc(struct irq_info *info)
info->is_accounted = 1;
}
+static void delayed_free_irq(struct work_struct *work)
+{
+ struct irq_info *info = container_of(to_rcu_work(work), struct irq_info,
+ rwork);
+ unsigned int irq = info->irq;
+
+ /* Remove the info pointer only now, with no potential users left. */
+ set_info_for_irq(irq, NULL);
+
+ kfree(info);
+
+ /* Legacy IRQ descriptors are managed by the arch. */
+ if (irq >= nr_legacy_irqs())
+ irq_free_desc(irq);
+}
+
/* Constructors for packed IRQ information. */
static int xen_irq_info_common_setup(struct irq_info *info,
unsigned irq,
@@ -668,33 +676,36 @@ static void xen_irq_lateeoi_worker(struct work_struct *work)
eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed);
- read_lock_irqsave(&evtchn_rwlock, flags);
+ rcu_read_lock();
while (true) {
- spin_lock(&eoi->eoi_list_lock);
+ spin_lock_irqsave(&eoi->eoi_list_lock, flags);
info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info,
eoi_list);
- if (info == NULL || now < info->eoi_time) {
- spin_unlock(&eoi->eoi_list_lock);
+ if (info == NULL)
+ break;
+
+ if (now < info->eoi_time) {
+ mod_delayed_work_on(info->eoi_cpu, system_wq,
+ &eoi->delayed,
+ info->eoi_time - now);
break;
}
list_del_init(&info->eoi_list);
- spin_unlock(&eoi->eoi_list_lock);
+ spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
info->eoi_time = 0;
xen_irq_lateeoi_locked(info, false);
}
- if (info)
- mod_delayed_work_on(info->eoi_cpu, system_wq,
- &eoi->delayed, info->eoi_time - now);
+ spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
- read_unlock_irqrestore(&evtchn_rwlock, flags);
+ rcu_read_unlock();
}
static void xen_cpu_init_eoi(unsigned int cpu)
@@ -709,16 +720,15 @@ static void xen_cpu_init_eoi(unsigned int cpu)
void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags)
{
struct irq_info *info;
- unsigned long flags;
- read_lock_irqsave(&evtchn_rwlock, flags);
+ rcu_read_lock();
info = info_for_irq(irq);
if (info)
xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS);
- read_unlock_irqrestore(&evtchn_rwlock, flags);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(xen_irq_lateeoi);
@@ -732,6 +742,7 @@ static void xen_irq_init(unsigned irq)
info->type = IRQT_UNBOUND;
info->refcnt = -1;
+ INIT_RCU_WORK(&info->rwork, delayed_free_irq);
set_info_for_irq(irq, info);
/*
@@ -789,31 +800,18 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi)
static void xen_free_irq(unsigned irq)
{
struct irq_info *info = info_for_irq(irq);
- unsigned long flags;
if (WARN_ON(!info))
return;
- write_lock_irqsave(&evtchn_rwlock, flags);
-
if (!list_empty(&info->eoi_list))
lateeoi_list_del(info);
list_del(&info->list);
- set_info_for_irq(irq, NULL);
-
WARN_ON(info->refcnt > 0);
- write_unlock_irqrestore(&evtchn_rwlock, flags);
-
- kfree(info);
-
- /* Legacy IRQ descriptors are managed by the arch. */
- if (irq < nr_legacy_irqs())
- return;
-
- irq_free_desc(irq);
+ queue_rcu_work(system_wq, &info->rwork);
}
/* Not called for lateeoi events. */
@@ -1231,7 +1229,8 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip,
bind_evtchn_to_cpu(evtchn, 0, false);
} else {
struct irq_info *info = info_for_irq(irq);
- WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
+ if (!WARN_ON(!info || info->type != IRQT_EVTCHN))
+ info->refcnt++;
}
out:
@@ -1704,14 +1703,21 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
generic_handle_irq(irq);
}
-static int __xen_evtchn_do_upcall(void)
+int xen_evtchn_do_upcall(void)
{
struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
int ret = vcpu_info->evtchn_upcall_pending ? IRQ_HANDLED : IRQ_NONE;
int cpu = smp_processor_id();
struct evtchn_loop_ctrl ctrl = { 0 };
- read_lock(&evtchn_rwlock);
+ /*
+ * When closing an event channel the associated IRQ must not be freed
+ * until all cpus have left the event handling loop. This is ensured
+ * by taking the rcu_read_lock() while handling events, as freeing of
+ * the IRQ is handled via queue_rcu_work() _after_ closing the event
+ * channel.
+ */
+ rcu_read_lock();
do {
vcpu_info->evtchn_upcall_pending = 0;
@@ -1724,7 +1730,7 @@ static int __xen_evtchn_do_upcall(void)
} while (vcpu_info->evtchn_upcall_pending);
- read_unlock(&evtchn_rwlock);
+ rcu_read_unlock();
/*
* Increment irq_epoch only now to defer EOIs only for
@@ -1735,24 +1741,7 @@ static int __xen_evtchn_do_upcall(void)
return ret;
}
-
-void xen_evtchn_do_upcall(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
-
- irq_enter();
-
- __xen_evtchn_do_upcall();
-
- irq_exit();
- set_irq_regs(old_regs);
-}
-
-int xen_hvm_evtchn_do_upcall(void)
-{
- return __xen_evtchn_do_upcall();
-}
-EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
+EXPORT_SYMBOL_GPL(xen_evtchn_do_upcall);
/* Rebind a new event channel to an existing irq. */
void rebind_evtchn_irq(evtchn_port_t evtchn, int irq)
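
The events_base.c changes above drop the global evtchn_rwlock in favour of RCU: the upcall loop and xen_irq_lateeoi() only take rcu_read_lock(), while xen_free_irq() defers the actual kfree()/irq_free_desc() through queue_rcu_work(), so the irq_info can only be freed once every CPU has left the event handling loop. A minimal sketch of that pattern follows; it is illustrative only, and lookup_info(), use_info() and unpublish_info() are hypothetical placeholders for the lookup and unlink steps.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct info_like {
	struct rcu_work rwork;
	/* ... payload ... */
};

static void delayed_free(struct work_struct *work)
{
	struct info_like *info =
		container_of(to_rcu_work(work), struct info_like, rwork);

	kfree(info);	/* runs only after a full RCU grace period */
}

/* Reader side: the object stays valid for the whole critical section. */
rcu_read_lock();
info = lookup_info(irq);	/* hypothetical lookup */
if (info)
	use_info(info);		/* hypothetical use */
rcu_read_unlock();

/* Writer side: unlink first, then schedule the deferred free.
 * (The patch calls INIT_RCU_WORK() once at init time, in xen_irq_init().)
 */
INIT_RCU_WORK(&info->rwork, delayed_free);
unpublish_info(irq);		/* hypothetical: remove all lookup paths */
queue_rcu_work(system_wq, &info->rwork);
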
diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
index ad9fe51d3fb3..655775db7caf 100644
--- a/drivers/xen/events/events_fifo.c
+++ b/drivers/xen/events/events_fifo.c
@@ -226,21 +226,20 @@ static bool evtchn_fifo_is_masked(evtchn_port_t port)
*/
static bool clear_masked_cond(volatile event_word_t *word)
{
- event_word_t new, old, w;
+ event_word_t new, old;
- w = *word;
+ old = *word;
do {
- if (!(w & (1 << EVTCHN_FIFO_MASKED)))
+ if (!(old & (1 << EVTCHN_FIFO_MASKED)))
return true;
- if (w & (1 << EVTCHN_FIFO_PENDING))
+ if (old & (1 << EVTCHN_FIFO_PENDING))
return false;
- old = w & ~(1 << EVTCHN_FIFO_BUSY);
+ old = old & ~(1 << EVTCHN_FIFO_BUSY);
new = old & ~(1 << EVTCHN_FIFO_MASKED);
- w = sync_cmpxchg(word, old, new);
- } while (w != old);
+ } while (!sync_try_cmpxchg(word, &old, new));
return true;
}
@@ -259,17 +258,16 @@ static void evtchn_fifo_unmask(evtchn_port_t port)
static uint32_t clear_linked(volatile event_word_t *word)
{
- event_word_t new, old, w;
+ event_word_t new, old;
- w = *word;
+ old = *word;
do {
- old = w;
- new = (w & ~((1 << EVTCHN_FIFO_LINKED)
- | EVTCHN_FIFO_LINK_MASK));
- } while ((w = sync_cmpxchg(word, old, new)) != old);
+ new = (old & ~((1 << EVTCHN_FIFO_LINKED)
+ | EVTCHN_FIFO_LINK_MASK));
+ } while (!sync_try_cmpxchg(word, &old, new));
- return w & EVTCHN_FIFO_LINK_MASK;
+ return old & EVTCHN_FIFO_LINK_MASK;
}
static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl,
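
The events_fifo.c hunks above (and the grant-table.c hunk further down) convert open-coded sync_cmpxchg() retry loops into sync_try_cmpxchg(). The try variant returns a bool and, on failure, writes the value it actually found back into *old, so the extra temporary and the manual reload disappear. A generic sketch of the idiom, with SOME_BIT as a placeholder flag:

/* Old style: compare the returned value against the expected one by hand. */
old = *word;
for (;;) {
	new = old & ~SOME_BIT;
	cur = sync_cmpxchg(word, old, new);
	if (cur == old)
		break;
	old = cur;		/* somebody else changed *word, retry */
}

/* New style: 'old' is refreshed automatically when the exchange fails. */
old = *word;
do {
	new = old & ~SOME_BIT;
} while (!sync_try_cmpxchg(word, &old, new));
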
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 9139a7364df5..59717628ca42 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -397,7 +397,7 @@ static int evtchn_bind_to_user(struct per_user_data *u, evtchn_port_t port,
if (rc < 0)
goto err;
- rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, 0,
+ rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, IRQF_SHARED,
u->name, evtchn);
if (rc < 0)
goto err;
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 35659bf70746..04a6b470b15d 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -427,16 +427,14 @@ EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref)
{
- u16 flags, nflags;
- u16 *pflags;
+ u16 *pflags = &gnttab_shared.v1[ref].flags;
+ u16 flags;
- pflags = &gnttab_shared.v1[ref].flags;
- nflags = *pflags;
+ flags = *pflags;
do {
- flags = nflags;
if (flags & (GTF_reading|GTF_writing))
return 0;
- } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags);
+ } while (!sync_try_cmpxchg(pflags, &flags, 0));
return 1;
}
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index fcc819131572..544d3f9010b9 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -64,7 +64,7 @@ static uint64_t get_callback_via(struct pci_dev *pdev)
static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
{
- return xen_hvm_evtchn_do_upcall();
+ return xen_evtchn_do_upcall();
}
static int xen_allocate_irq(struct pci_dev *pdev)
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index f00ad5f5f1d4..1ce7f3c7a950 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -29,15 +29,18 @@
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/moduleparam.h>
+#include <linux/virtio_mmio.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
+#include <xen/events.h>
#include <xen/privcmd.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/interface/hvm/dm_op.h>
+#include <xen/interface/hvm/ioreq.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/xen-ops.h>
@@ -782,6 +785,7 @@ static long privcmd_ioctl_mmap_resource(struct file *file,
goto out;
pages = vma->vm_private_data;
+
for (i = 0; i < kdata.num; i++) {
xen_pfn_t pfn =
page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
@@ -838,7 +842,7 @@ out:
return rc;
}
-#ifdef CONFIG_XEN_PRIVCMD_IRQFD
+#ifdef CONFIG_XEN_PRIVCMD_EVENTFD
/* Irqfd support */
static struct workqueue_struct *irqfd_cleanup_wq;
static DEFINE_MUTEX(irqfds_lock);
@@ -935,7 +939,7 @@ static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
return -ENOMEM;
dm_op = kirqfd + 1;
- if (copy_from_user(dm_op, irqfd->dm_op, irqfd->size)) {
+ if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
ret = -EFAULT;
goto error_kfree;
}
@@ -1079,6 +1083,389 @@ static void privcmd_irqfd_exit(void)
destroy_workqueue(irqfd_cleanup_wq);
}
+
+/* Ioeventfd Support */
+#define QUEUE_NOTIFY_VQ_MASK 0xFFFF
+
+static DEFINE_MUTEX(ioreq_lock);
+static LIST_HEAD(ioreq_list);
+
+/* per-eventfd structure */
+struct privcmd_kernel_ioeventfd {
+ struct eventfd_ctx *eventfd;
+ struct list_head list;
+ u64 addr;
+ unsigned int addr_len;
+ unsigned int vq;
+};
+
+/* per-guest CPU / port structure */
+struct ioreq_port {
+ int vcpu;
+ unsigned int port;
+ struct privcmd_kernel_ioreq *kioreq;
+};
+
+/* per-guest structure */
+struct privcmd_kernel_ioreq {
+ domid_t dom;
+ unsigned int vcpus;
+ u64 uioreq;
+ struct ioreq *ioreq;
+ spinlock_t lock; /* Protects ioeventfds list */
+ struct list_head ioeventfds;
+ struct list_head list;
+ struct ioreq_port ports[0];
+};
+
+static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
+{
+ struct ioreq_port *port = dev_id;
+ struct privcmd_kernel_ioreq *kioreq = port->kioreq;
+ struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
+ struct privcmd_kernel_ioeventfd *kioeventfd;
+ unsigned int state = STATE_IOREQ_READY;
+
+ if (ioreq->state != STATE_IOREQ_READY ||
+ ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
+ return IRQ_NONE;
+
+ /*
+ * We need a barrier, smp_mb(), here to ensure reads are finished before
+ * `state` is updated. Since the lock implementation ensures that
+ * appropriate barrier will be added anyway, we can avoid adding
+ * explicit barrier here.
+ *
+ * Ideally we don't need to update `state` within the locks, but we do
+ * that here to avoid adding explicit barrier.
+ */
+
+ spin_lock(&kioreq->lock);
+ ioreq->state = STATE_IOREQ_INPROCESS;
+
+ list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
+ if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
+ ioreq->size == kioeventfd->addr_len &&
+ (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
+ eventfd_signal(kioeventfd->eventfd, 1);
+ state = STATE_IORESP_READY;
+ break;
+ }
+ }
+ spin_unlock(&kioreq->lock);
+
+ /*
+ * We need a barrier, smp_mb(), here to ensure writes are finished
+ * before `state` is updated. Since the lock implementation ensures that
+ * appropriate barrier will be added anyway, we can avoid adding
+ * explicit barrier here.
+ */
+
+ ioreq->state = state;
+
+ if (state == STATE_IORESP_READY) {
+ notify_remote_via_evtchn(port->port);
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_NONE;
+}
+
+static void ioreq_free(struct privcmd_kernel_ioreq *kioreq)
+{
+ struct ioreq_port *ports = kioreq->ports;
+ int i;
+
+ lockdep_assert_held(&ioreq_lock);
+
+ list_del(&kioreq->list);
+
+ for (i = kioreq->vcpus - 1; i >= 0; i--)
+ unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]);
+
+ kfree(kioreq);
+}
+
+static
+struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd)
+{
+ struct privcmd_kernel_ioreq *kioreq;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct page **pages;
+ unsigned int *ports;
+ int ret, size, i;
+
+ lockdep_assert_held(&ioreq_lock);
+
+ size = struct_size(kioreq, ports, ioeventfd->vcpus);
+ kioreq = kzalloc(size, GFP_KERNEL);
+ if (!kioreq)
+ return ERR_PTR(-ENOMEM);
+
+ kioreq->dom = ioeventfd->dom;
+ kioreq->vcpus = ioeventfd->vcpus;
+ kioreq->uioreq = ioeventfd->ioreq;
+ spin_lock_init(&kioreq->lock);
+ INIT_LIST_HEAD(&kioreq->ioeventfds);
+
+ /* The memory for ioreq server must have been mapped earlier */
+ mmap_write_lock(mm);
+ vma = find_vma(mm, (unsigned long)ioeventfd->ioreq);
+ if (!vma) {
+ pr_err("Failed to find vma for ioreq page!\n");
+ mmap_write_unlock(mm);
+ ret = -EFAULT;
+ goto error_kfree;
+ }
+
+ pages = vma->vm_private_data;
+ kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0]));
+ mmap_write_unlock(mm);
+
+ size = sizeof(*ports) * kioreq->vcpus;
+ ports = kzalloc(size, GFP_KERNEL);
+ if (!ports) {
+ ret = -ENOMEM;
+ goto error_kfree;
+ }
+
+ if (copy_from_user(ports, u64_to_user_ptr(ioeventfd->ports), size)) {
+ ret = -EFAULT;
+ goto error_kfree_ports;
+ }
+
+ for (i = 0; i < kioreq->vcpus; i++) {
+ kioreq->ports[i].vcpu = i;
+ kioreq->ports[i].port = ports[i];
+ kioreq->ports[i].kioreq = kioreq;
+
+ ret = bind_evtchn_to_irqhandler_lateeoi(ports[i],
+ ioeventfd_interrupt, IRQF_SHARED, "ioeventfd",
+ &kioreq->ports[i]);
+ if (ret < 0)
+ goto error_unbind;
+ }
+
+ kfree(ports);
+
+ list_add_tail(&kioreq->list, &ioreq_list);
+
+ return kioreq;
+
+error_unbind:
+ while (--i >= 0)
+ unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]);
+error_kfree_ports:
+ kfree(ports);
+error_kfree:
+ kfree(kioreq);
+ return ERR_PTR(ret);
+}
+
+static struct privcmd_kernel_ioreq *
+get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd)
+{
+ struct privcmd_kernel_ioreq *kioreq;
+ unsigned long flags;
+
+ list_for_each_entry(kioreq, &ioreq_list, list) {
+ struct privcmd_kernel_ioeventfd *kioeventfd;
+
+ /*
+ * kioreq fields can be accessed here without a lock as they are
+ * never updated after being added to the ioreq_list.
+ */
+ if (kioreq->uioreq != ioeventfd->ioreq) {
+ continue;
+ } else if (kioreq->dom != ioeventfd->dom ||
+ kioreq->vcpus != ioeventfd->vcpus) {
+ pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n",
+ kioreq->dom, ioeventfd->dom, kioreq->vcpus,
+ ioeventfd->vcpus);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Look for a duplicate eventfd for the same guest */
+ spin_lock_irqsave(&kioreq->lock, flags);
+ list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
+ if (eventfd == kioeventfd->eventfd) {
+ spin_unlock_irqrestore(&kioreq->lock, flags);
+ return ERR_PTR(-EBUSY);
+ }
+ }
+ spin_unlock_irqrestore(&kioreq->lock, flags);
+
+ return kioreq;
+ }
+
+ /* Matching kioreq isn't found, allocate a new one */
+ return alloc_ioreq(ioeventfd);
+}
+
+static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd)
+{
+ list_del(&kioeventfd->list);
+ eventfd_ctx_put(kioeventfd->eventfd);
+ kfree(kioeventfd);
+}
+
+static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
+{
+ struct privcmd_kernel_ioeventfd *kioeventfd;
+ struct privcmd_kernel_ioreq *kioreq;
+ unsigned long flags;
+ struct fd f;
+ int ret;
+
+ /* Check for range overflow */
+ if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr)
+ return -EINVAL;
+
+ /* Vhost requires us to support length 1, 2, 4, and 8 */
+ if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 ||
+ ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8))
+ return -EINVAL;
+
+ /* 4096 vcpus limit enough ? */
+ if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096)
+ return -EINVAL;
+
+ kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL);
+ if (!kioeventfd)
+ return -ENOMEM;
+
+ f = fdget(ioeventfd->event_fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto error_kfree;
+ }
+
+ kioeventfd->eventfd = eventfd_ctx_fileget(f.file);
+ fdput(f);
+
+ if (IS_ERR(kioeventfd->eventfd)) {
+ ret = PTR_ERR(kioeventfd->eventfd);
+ goto error_kfree;
+ }
+
+ kioeventfd->addr = ioeventfd->addr;
+ kioeventfd->addr_len = ioeventfd->addr_len;
+ kioeventfd->vq = ioeventfd->vq;
+
+ mutex_lock(&ioreq_lock);
+ kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd);
+ if (IS_ERR(kioreq)) {
+ mutex_unlock(&ioreq_lock);
+ ret = PTR_ERR(kioreq);
+ goto error_eventfd;
+ }
+
+ spin_lock_irqsave(&kioreq->lock, flags);
+ list_add_tail(&kioeventfd->list, &kioreq->ioeventfds);
+ spin_unlock_irqrestore(&kioreq->lock, flags);
+
+ mutex_unlock(&ioreq_lock);
+
+ return 0;
+
+error_eventfd:
+ eventfd_ctx_put(kioeventfd->eventfd);
+
+error_kfree:
+ kfree(kioeventfd);
+ return ret;
+}
+
+static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd)
+{
+ struct privcmd_kernel_ioreq *kioreq, *tkioreq;
+ struct eventfd_ctx *eventfd;
+ unsigned long flags;
+ int ret = 0;
+
+ eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&ioreq_lock);
+ list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) {
+ struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
+ /*
+ * kioreq fields can be accessed here without a lock as they are
+ * never updated after being added to the ioreq_list.
+ */
+ if (kioreq->dom != ioeventfd->dom ||
+ kioreq->uioreq != ioeventfd->ioreq ||
+ kioreq->vcpus != ioeventfd->vcpus)
+ continue;
+
+ spin_lock_irqsave(&kioreq->lock, flags);
+ list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) {
+ if (eventfd == kioeventfd->eventfd) {
+ ioeventfd_free(kioeventfd);
+ spin_unlock_irqrestore(&kioreq->lock, flags);
+
+ if (list_empty(&kioreq->ioeventfds))
+ ioreq_free(kioreq);
+ goto unlock;
+ }
+ }
+ spin_unlock_irqrestore(&kioreq->lock, flags);
+ break;
+ }
+
+ pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n",
+ ioeventfd->dom, ioeventfd->addr);
+ ret = -ENODEV;
+
+unlock:
+ mutex_unlock(&ioreq_lock);
+ eventfd_ctx_put(eventfd);
+
+ return ret;
+}
+
+static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
+{
+ struct privcmd_data *data = file->private_data;
+ struct privcmd_ioeventfd ioeventfd;
+
+ if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd)))
+ return -EFAULT;
+
+ /* No other flags should be set */
+ if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
+ return -EINVAL;
+
+ /* If restriction is in place, check the domid matches */
+ if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom)
+ return -EPERM;
+
+ if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
+ return privcmd_ioeventfd_deassign(&ioeventfd);
+
+ return privcmd_ioeventfd_assign(&ioeventfd);
+}
+
+static void privcmd_ioeventfd_exit(void)
+{
+ struct privcmd_kernel_ioreq *kioreq, *tmp;
+ unsigned long flags;
+
+ mutex_lock(&ioreq_lock);
+ list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) {
+ struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
+
+ spin_lock_irqsave(&kioreq->lock, flags);
+ list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list)
+ ioeventfd_free(kioeventfd);
+ spin_unlock_irqrestore(&kioreq->lock, flags);
+
+ ioreq_free(kioreq);
+ }
+ mutex_unlock(&ioreq_lock);
+}
#else
static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
{
@@ -1093,7 +1480,16 @@ static inline int privcmd_irqfd_init(void)
static inline void privcmd_irqfd_exit(void)
{
}
-#endif /* CONFIG_XEN_PRIVCMD_IRQFD */
+
+static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void privcmd_ioeventfd_exit(void)
+{
+}
+#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */
static long privcmd_ioctl(struct file *file,
unsigned int cmd, unsigned long data)
@@ -1134,6 +1530,10 @@ static long privcmd_ioctl(struct file *file,
ret = privcmd_ioctl_irqfd(file, udata);
break;
+ case IOCTL_PRIVCMD_IOEVENTFD:
+ ret = privcmd_ioctl_ioeventfd(file, udata);
+ break;
+
default:
break;
}
@@ -1278,6 +1678,7 @@ err_privcmdbuf:
static void __exit privcmd_exit(void)
{
+ privcmd_ioeventfd_exit();
privcmd_irqfd_exit();
misc_deregister(&privcmd_dev);
misc_deregister(&xen_privcmdbuf_dev);
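
For context, registering one of the new ioeventfds from a userspace backend might look roughly like the hypothetical sketch below. The field names mirror the kernel code above; privcmd_fd, ioreq_page, ports, mmio_base, nr_vcpus, queue_index and domid are placeholders, and the authoritative struct privcmd_ioeventfd layout lives in the uapi header xen/privcmd.h.

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <xen/privcmd.h>

struct privcmd_ioeventfd req = {
	.ioreq    = (uint64_t)(uintptr_t)ioreq_page, /* mmapped ioreq server page */
	.ports    = (uint64_t)(uintptr_t)ports,      /* one event channel port per vcpu */
	.addr     = mmio_base,                       /* virtio-mmio region base address */
	.addr_len = 4,                               /* QueueNotify is a 32-bit register */
	.event_fd = eventfd(0, EFD_CLOEXEC),
	.vcpus    = nr_vcpus,
	.vq       = queue_index,
	.dom      = domid,
};

if (ioctl(privcmd_fd, IOCTL_PRIVCMD_IOEVENTFD, &req))
	perror("IOCTL_PRIVCMD_IOEVENTFD");
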
diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c
index 059de92aea7d..d47eee6c5143 100644
--- a/drivers/xen/xen-pciback/conf_space.c
+++ b/drivers/xen/xen-pciback/conf_space.c
@@ -288,12 +288,6 @@ int xen_pcibk_get_interrupt_type(struct pci_dev *dev)
u16 val;
int ret = 0;
- err = pci_read_config_word(dev, PCI_COMMAND, &val);
- if (err)
- return err;
- if (!(val & PCI_COMMAND_INTX_DISABLE))
- ret |= INTERRUPT_TYPE_INTX;
-
/*
* Do not trust dev->msi(x)_enabled here, as enabling could be done
* bypassing the pci_*msi* functions, by the qemu.
@@ -316,6 +310,19 @@ int xen_pcibk_get_interrupt_type(struct pci_dev *dev)
if (val & PCI_MSIX_FLAGS_ENABLE)
ret |= INTERRUPT_TYPE_MSIX;
}
+
+ /*
+ * PCIe spec says device cannot use INTx if MSI/MSI-X is enabled,
+ * so check for INTx only when both are disabled.
+ */
+ if (!ret) {
+ err = pci_read_config_word(dev, PCI_COMMAND, &val);
+ if (err)
+ return err;
+ if (!(val & PCI_COMMAND_INTX_DISABLE))
+ ret |= INTERRUPT_TYPE_INTX;
+ }
+
return ret ?: INTERRUPT_TYPE_NONE;
}
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
index 097316a74126..1948a9700c8f 100644
--- a/drivers/xen/xen-pciback/conf_space_capability.c
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
@@ -236,10 +236,16 @@ static int msi_msix_flags_write(struct pci_dev *dev, int offset, u16 new_value,
return PCIBIOS_SET_FAILED;
if (new_value & field_config->enable_bit) {
- /* don't allow enabling together with other interrupt types */
+ /*
+ * Don't allow enabling together with other interrupt type, but do
+ * allow enabling MSI(-X) while INTx is still active to please Linuxes
+ * MSI(-X) startup sequence. It is safe to do, as according to PCI
+ * spec, device with enabled MSI(-X) shouldn't use INTx.
+ */
int int_type = xen_pcibk_get_interrupt_type(dev);
if (int_type == INTERRUPT_TYPE_NONE ||
+ int_type == INTERRUPT_TYPE_INTX ||
int_type == field_config->int_type)
goto write;
return PCIBIOS_SET_FAILED;
diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c
index 981435103af1..fc0332645966 100644
--- a/drivers/xen/xen-pciback/conf_space_header.c
+++ b/drivers/xen/xen-pciback/conf_space_header.c
@@ -104,24 +104,9 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
pci_clear_mwi(dev);
}
- if (dev_data && dev_data->allow_interrupt_control) {
- if ((cmd->val ^ value) & PCI_COMMAND_INTX_DISABLE) {
- if (value & PCI_COMMAND_INTX_DISABLE) {
- pci_intx(dev, 0);
- } else {
- /* Do not allow enabling INTx together with MSI or MSI-X. */
- switch (xen_pcibk_get_interrupt_type(dev)) {
- case INTERRUPT_TYPE_NONE:
- pci_intx(dev, 1);
- break;
- case INTERRUPT_TYPE_INTX:
- break;
- default:
- return PCIBIOS_SET_FAILED;
- }
- }
- }
- }
+ if (dev_data && dev_data->allow_interrupt_control &&
+ ((cmd->val ^ value) & PCI_COMMAND_INTX_DISABLE))
+ pci_intx(dev, !(value & PCI_COMMAND_INTX_DISABLE));
cmd->val = value;
diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
index 0792fda49a15..6f56640092a9 100644
--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -82,7 +82,7 @@ struct read_buffer {
struct list_head list;
unsigned int cons;
unsigned int len;
- char msg[];
+ char msg[] __counted_by(len);
};
struct xenbus_file_priv {
@@ -195,7 +195,7 @@ static int queue_reply(struct list_head *queue, const void *data, size_t len)
if (len > XENSTORE_PAYLOAD_MAX)
return -EINVAL;
- rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL);
+ rb = kmalloc(struct_size(rb, msg, len), GFP_KERNEL);
if (rb == NULL)
return -ENOMEM;
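
The xenbus_dev_frontend.c change pairs a __counted_by() annotation on the flexible msg[] array with struct_size() for the allocation, so FORTIFY can bounds-check accesses and the size computation is overflow-safe. The general idiom, as a small sketch (buf, data and n are placeholder names):

#include <linux/overflow.h>
#include <linux/slab.h>

struct buf {
	unsigned int len;
	char data[] __counted_by(len);
};

struct buf *b = kmalloc(struct_size(b, data, n), GFP_KERNEL);
if (b)
	b->len = n;	/* set the counter before writing into data[] */
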
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 639bf628389b..3205e5d724c8 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -1025,7 +1025,7 @@ static int __init xenbus_init(void)
if (err < 0) {
pr_err("xenstore_late_init couldn't bind irq err=%d\n",
err);
- return err;
+ goto out_error;
}
xs_init_irq = err;