[PATCH 04/23] AMD IOMMU: add branch hints to completion wait checks

Previous thread: [GIT PULL] please pull infiniband.git by Roland Dreier on Wednesday, September 17, 2008 - 9:40 am. (1 message)

Next thread: New IOCTLs by Singaravelan Nallasellan on Wednesday, September 17, 2008 - 9:55 am. (8 messages)
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

Hi all,

this is the patch series with all 2.6.28 updates I currently have for
the AMD IOMMU driver. The highlights in this series are the
implementation of lazy IOTLB flushing which increases performance and
code for AMD IOMMU event handling. Any events the hardware will send are
now visible. There are also a lot of minor fixes and cleanups which came
up during code review and LKML discussions from various persons. The
patches are relative to current tip/master branch.
Please review.

Here is the shortlog and the diffstat:

FUJITA Tomonori (1):
      AMD IOMMU: avoid unnecessary low zone allocation in alloc_coherent

Joerg Roedel (22):
      AMD IOMMU: check for invalid device pointers
      AMD IOMMU: move TLB flushing to the map/unmap helper functions
      AMD IOMMU: implement lazy IO/TLB flushing
      AMD IOMMU: add branch hints to completion wait checks
      AMD IOMMU: align alloc_coherent addresses properly
      AMD IOMMU: add event buffer allocation
      AMD IOMMU: save pci segment from ACPI tables
      AMD IOMMU: save pci_dev instead of devid
      AMD IOMMU: add MSI interrupt support
      AMD IOMMU: add event handling code
      AMD IOMMU: enable event logging
      AMD IOMMU: allow IO page faults from devices
      AMD IOMMU: add dma_supported callback
      AMD IOMMU: don't assing preallocated protection domains to devices
      AMD IOMMU: some set_device_domain cleanups
      AMD IOMMU: replace memset with __GFP_ZERO in alloc_coherent
      AMD IOMMU: simplify dma_mask_to_pages
      AMD IOMMU: free domain bitmap with its allocation order
      AMD IOMMU: remove unnecessary cast to u64 in the init code
      AMD IOMMU: calculate IVHD size with a function
      AMD IOMMU: use cmd_buf_size when freeing the command buffer
      add AMD IOMMU tree to MAINTAINERS file

 Documentation/kernel-parameters.txt |    5 +
 MAINTAINERS                         |    1 +
 arch/x86/Kconfig                    |    1 +
 arch/x86/kernel/amd_iommu.c         |  301 ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

This patch moves the invocation of the flushing functions to the
map/unmap helper functions because its common code in all dma_ops
relevant mapping/unmapping functions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |   19 +++++--------------
 1 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 695e0fc..691e023 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -795,6 +795,9 @@ static dma_addr_t __map_single(struct device *dev,
 	}
 	address += offset;
 
+	if (unlikely(iommu_has_npcache(iommu)))
+		iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
+
 out:
 	return address;
 }
@@ -825,6 +828,8 @@ static void __unmap_single(struct amd_iommu *iommu,
 	}
 
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
+
+	iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
 }
 
 /*
@@ -853,9 +858,6 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	if (addr == bad_dma_address)
 		goto out;
 
-	if (iommu_has_npcache(iommu))
-		iommu_flush_pages(iommu, domain->id, addr, size);
-
 	if (iommu->need_sync)
 		iommu_completion_wait(iommu);
 
@@ -885,8 +887,6 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, dir);
 
-	iommu_flush_pages(iommu, domain->id, dma_addr, size);
-
 	if (iommu->need_sync)
 		iommu_completion_wait(iommu);
 
@@ -948,9 +948,6 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 			mapped_elems++;
 		} else
 			goto unmap;
-		if (iommu_has_npcache(iommu))
-			iommu_flush_pages(iommu, domain->id, s->dma_address,
-					  s->dma_length);
 	}
 
 	if (iommu->need_sync)
@@ -996,8 +993,6 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	for_each_sg(sglist, s, nelems, i) {
 		__unmap_single(iommu, domain->priv, s->dma_address,
 			       s->dma_length, ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

This patch adds the allocation of a event buffer for each AMD IOMMU in
the system. The hardware will log events like device page faults or
other errors to this buffer once this is enabled.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c  |   29 +++++++++++++++++++++++++++++
 include/asm-x86/amd_iommu_types.h |    9 +++++++++
 2 files changed, 38 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index da631ab..a902eb6 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -418,6 +418,30 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
 	free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
 }
 
+/* allocates the memory where the IOMMU will log its events to */
+static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						get_order(EVT_BUFFER_SIZE));
+
+	if (iommu->evt_buf == NULL)
+		return NULL;
+
+	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+		    &entry, sizeof(entry));
+
+	iommu->evt_buf_size = EVT_BUFFER_SIZE;
+
+	return iommu->evt_buf;
+}
+
+static void __init free_event_buffer(struct amd_iommu *iommu)
+{
+	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
+}
+
 /* sets a specific bit in the device table entry. */
 static void set_dev_entry_bit(u16 devid, u8 bit)
 {
@@ -623,6 +647,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
 static void __init free_iommu_one(struct amd_iommu *iommu)
 {
 	free_command_buffer(iommu);
+	free_event_buffer(iommu);
 	iommu_unmap_mmio_space(iommu);
 }
 
@@ -662,6 +687,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	if (!iommu->cmd_buf)
 		return -ENOMEM;
 
+	iommu->evt_buf = alloc_event_buffer(iommu);
+	if ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

From: Joerg Roedel <jroedel@lemmy.amd.com>

The command buffer release function uses the CMD_BUF_SIZE macro for
get_order. Replace this with iommu->cmd_buf_size which is more reliable
about the actual size of the buffer.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5303404..d0ea4d3 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -434,7 +434,8 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
 {
-	free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
+	free_pages((unsigned long)iommu->cmd_buf,
+		   get_order(iommu->cmd_buf_size));
 }
 
 /* allocates the memory where the IOMMU will log its events to */
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

This patch adds branch hints to the cecks if a completion_wait is
necessary. The completion_waits in the mapping paths are unlikly because
they will only happen on software implementations of AMD IOMMU which
don't exists today or with lazy IO/TLB flushing when the allocator wraps
around the address space. With lazy IO/TLB flushing the completion_wait
in the unmapping path is unlikely too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0569098..7e9e4e7 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -876,7 +876,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	if (addr == bad_dma_address)
 		goto out;
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -905,7 +905,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, dir);
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
@@ -968,7 +968,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 			goto unmap;
 	}
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -1014,7 +1014,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 		s->dma_address = s->dma_length = 0;
 	}
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
@@ -1061,7 +1061,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
 		goto out;
 	}
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -1093,7 +1093,7 @@ static void free_coherent(struct ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

We need the pci_dev later anyways to enable MSI for the IOMMU hardware.
So remove the devid pointing to the BDF and replace it with the pci_dev
structure where the IOMMU is implemented.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c  |   25 ++++++++++++++++---------
 include/asm-x86/amd_iommu_types.h |    5 +++--
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index aa26e37..da619fc 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -243,9 +243,12 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 /* Function to enable the hardware */
 void __init iommu_enable(struct amd_iommu *iommu)
 {
-	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
-	print_devid(iommu->devid, 0);
-	printk(" cap 0x%hx\n", iommu->cap_ptr);
+	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
+	       "at %02x:%02x.%x cap 0x%hx\n",
+	       iommu->dev->bus->number,
+	       PCI_SLOT(iommu->dev->devfn),
+	       PCI_FUNC(iommu->dev->devfn),
+	       iommu->cap_ptr);
 
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
@@ -512,15 +515,14 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
  */
 static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 {
-	int bus = PCI_BUS(iommu->devid);
-	int dev = PCI_SLOT(iommu->devid);
-	int fn  = PCI_FUNC(iommu->devid);
 	int cap_ptr = iommu->cap_ptr;
 	u32 range;
 
-	iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
+			      &iommu->cap);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
+			      &range);
 
-	range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
 	iommu->first_device = calc_devid(MMIO_GET_BUS(range),
 					 MMIO_GET_FD(range));
 	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
@@ -675,7 ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

From: Joerg Roedel <jroedel@lemmy.amd.com>

The ctrl variable is only u32 and readl also returns a 32 bit value. So
the cast to u64 is pointless. Remove it with this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a307cf6..b876499 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -236,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 {
 	u32 ctrl;
 
-	ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
 	ctrl &= ~(1 << bit);
 	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
 }
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

From: Joerg Roedel <jroedel@lemmy.amd.com>

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 MAINTAINERS |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 38713f9..a649ea5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -387,6 +387,7 @@ AMD IOMMU (AMD-VI)
 P:	Joerg Roedel
 M:	joerg.roedel@amd.com
 L:	iommu@lists.linux-foundation.org
+T:	git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux-2.6-iommu.git
 S:	Supported
 
 AMD MICROCODE UPDATE SUPPORT
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

Remove some magic numbers and split the pte_root using standard
functions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c       |    9 +++++----
 include/asm-x86/amd_iommu_types.h |    3 +++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 09d5e92..8c97360 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -739,12 +739,13 @@ static void set_device_domain(struct amd_iommu *iommu,
 
 	u64 pte_root = virt_to_phys(domain->pt_root);
 
-	pte_root |= (domain->mode & 0x07) << 9;
-	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+		    << DEV_ENTRY_MODE_SHIFT;
+	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
 	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	amd_iommu_dev_table[devid].data[0] = pte_root;
-	amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
 	amd_iommu_dev_table[devid].data[2] = domain->id;
 
 	amd_iommu_pd_table[devid] = domain;
diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h
index 56dc6bd..b308586 100644
--- a/include/asm-x86/amd_iommu_types.h
+++ b/include/asm-x86/amd_iommu_types.h
@@ -130,6 +130,8 @@
 #define DEV_ENTRY_NMI_PASS      0xba
 #define DEV_ENTRY_LINT0_PASS    0xbe
 #define DEV_ENTRY_LINT1_PASS    0xbf
+#define DEV_ENTRY_MODE_MASK	0x07
+#define DEV_ENTRY_MODE_SHIFT	0x09
 
 /* constants to configure the command buffer */
 #define CMD_BUFFER_SIZE    8192
@@ -159,6 +161,7 @@
 #define IOMMU_MAP_SIZE_L3 (1ULL << 39)
 
 #define IOMMU_PTE_P  (1ULL << 0)
+#define IOMMU_PTE_TV (1ULL << 1)
 #define IOMMU_PTE_U  (1ULL << 59)
 #define IOMMU_PTE_FC (1ULL << 60)
 #define IOMMU_PTE_IR (1ULL << 61)
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

Currently AMD IOMMU code triggers a BUG_ON if NULL is passed as the
device. This is inconsistent with other IOMMU implementations. This
patch fixes it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |   43 +++++++++++++++++++++++++++++++++++--------
 1 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 01c68c3..695e0fc 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -646,6 +646,18 @@ static void set_device_domain(struct amd_iommu *iommu,
  *****************************************************************************/
 
 /*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+	if (!dev || !dev->dma_mask)
+		return false;
+
+	return true;
+}
+
+/*
  * In the dma_ops path we only have the struct device. This function
  * finds the corresponding IOMMU, the protection domain and the
  * requestor id for a given device.
@@ -661,18 +673,19 @@ static int get_device_resources(struct device *dev,
 	struct pci_dev *pcidev;
 	u16 _bdf;
 
-	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+	*iommu = NULL;
+	*domain = NULL;
+	*bdf = 0xffff;
+
+	if (dev->bus != &pci_bus_type)
+		return 0;
 
 	pcidev = to_pci_dev(dev);
 	_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
 
 	/* device not translated by any IOMMU in the system? */
-	if (_bdf > amd_iommu_last_bdf) {
-		*iommu = NULL;
-		*domain = NULL;
-		*bdf = 0xffff;
+	if (_bdf > amd_iommu_last_bdf)
 		return 0;
-	}
 
 	*bdf = amd_iommu_alias_table[_bdf];
 
@@ -826,6 +839,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	u16 devid;
 	dma_addr_t addr;
 
+	if (!check_device(dev))
+		return bad_dma_address;
+
 	get_device_resources(dev, &iommu, &domain, &devid);
 
 	if (iommu == NULL || domain == NULL)
@@ ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

In isolation mode the protection domains for the devices are
preallocated and preassigned. This is bad if a device should be passed
to a virtualization guest because the IOMMU code does not know if it is
in use by a driver. This patch changes the code to assign the device to
the preallocated domain only if there were dma mapping requests for it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c       |   43 ++++++++++++++++++++++++++++++++----
 include/asm-x86/amd_iommu_types.h |    6 +++++
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0410be1..09d5e92 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
 
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 
+/* A list of preallocated protection domains */
+static LIST_HEAD(iommu_pd_list);
+static DEFINE_SPINLOCK(iommu_pd_list_lock);
+
 /*
  * general struct to manage commands send to an IOMMU
  */
@@ -663,6 +667,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->next_bit = 0;
 
 	dma_dom->need_flush = false;
+	dma_dom->target_dev = 0xffff;
 
 	/* Intialize the exclusion range if necessary */
 	if (iommu->exclusion_start &&
@@ -769,6 +774,33 @@ static bool check_device(struct device *dev)
 }
 
 /*
+ * In this function the list of preallocated protection domains is traversed to
+ * find the domain for a specific device
+ */
+static struct dma_ops_domain *find_protection_domain(u16 devid)
+{
+	struct dma_ops_domain *entry, *ret = NULL;
+	unsigned long flags;
+
+	if (list_empty(&iommu_pd_list))
+		return NULL;
+
+	spin_lock_irqsave(&iommu_pd_list_lock, flags);
+
+	list_for_each_entry(entry, &iommu_pd_list, list) {
+		if (entry->target_dev == devid) {
+			ret = entry;
+			list_del(&ret->list);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+	return ret;
+}
+
+/*
  * In the ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>

x86's common alloc_coherent (dma_alloc_coherent in dma-mapping.h) sets
up the gfp flag according to the device dma_mask but AMD IOMMU doesn't
need it for devices that the IOMMU can do virtual mappings for. This
patch avoids unnecessary low zone allocation.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 8c97360..963b8bb 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1173,6 +1173,9 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	if (!check_device(dev))
 		return NULL;
 
+	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
 	virt_addr = (void *)__get_free_pages(flag, get_order(size));
 	if (!virt_addr)
 		return 0;
@@ -1180,8 +1183,6 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	memset(virt_addr, 0, size);
 	paddr = virt_to_phys(virt_addr);
 
-	get_device_resources(dev, &iommu, &domain, &devid);
-
 	if (!iommu || !domain) {
 		*dma_addr = (dma_addr_t)paddr;
 		return virt_addr;
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

The API definition for dma_alloc_coherent states that the bus address
has to be aligned to the next power of 2 boundary greater than the
allocation size. This is violated by AMD IOMMU so far and this patch
fixes it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |   22 ++++++++++++++--------
 1 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 7e9e4e7..fdec963 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -383,7 +383,8 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
  */
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     struct dma_ops_domain *dom,
-					     unsigned int pages)
+					     unsigned int pages,
+					     unsigned long align_mask)
 {
 	unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
 	unsigned long address;
@@ -400,10 +401,10 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 	}
 
 	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
-			0 , boundary_size, 0);
+				   0 , boundary_size, align_mask);
 	if (address == -1) {
 		address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
-				0, boundary_size, 0);
+				0, boundary_size, align_mask);
 		dom->need_flush = true;
 	}
 
@@ -787,17 +788,22 @@ static dma_addr_t __map_single(struct device *dev,
 			       struct dma_ops_domain *dma_dom,
 			       phys_addr_t paddr,
 			       size_t size,
-			       int dir)
+			       int dir,
+			       bool align)
 {
 	dma_addr_t offset = paddr & ~PAGE_MASK;
 	dma_addr_t address, start;
 	unsigned int pages;
+	unsigned long align_mask = 0;
 	int i;
 
 	pages = iommu_num_pages(paddr, size);
 	paddr &= PAGE_MASK;
 
-	address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+	if (align)
+		align_mask = (1UL << get_order(size)) - 1;
+
+	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask);
 	if (unlikely(address ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

Remove the memset to 0 and use __GFP_ZERO at allocation time instead.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 963b8bb..65aed6e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1176,11 +1176,11 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	if (!get_device_resources(dev, &iommu, &domain, &devid))
 		flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
 
+	flag |= __GFP_ZERO;
 	virt_addr = (void *)__get_free_pages(flag, get_order(size));
 	if (!virt_addr)
 		return 0;
 
-	memset(virt_addr, 0, size);
 	paddr = virt_to_phys(virt_addr);
 
 	if (!iommu || !domain) {
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

The AMD IOMMU can generate interrupts for various reasons. This patch
adds the basic interrupt enabling infrastructure to the driver.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/Kconfig                  |    1 +
 arch/x86/kernel/amd_iommu.c       |   11 ++++
 arch/x86/kernel/amd_iommu_init.c  |   99 ++++++++++++++++++++++++++++++++++++-
 include/asm-x86/amd_iommu.h       |    3 +
 include/asm-x86/amd_iommu_types.h |    7 +++
 5 files changed, 120 insertions(+), 1 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5258240..9c2898c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -567,6 +567,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 config AMD_IOMMU
 	bool "AMD IOMMU support"
 	select SWIOTLB
+	select PCI_MSI
 	depends on X86_64 && PCI && ACPI
 	help
 	  With this option you can enable support for AMD IOMMU hardware in
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index fdec963..3caef6b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -51,6 +51,17 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
 
 /****************************************************************************
  *
+ * Interrupt handling functions
+ *
+ ****************************************************************************/
+
+irqreturn_t amd_iommu_int_handler(int irq, void *data)
+{
+	return IRQ_NONE;
+}
+
+/****************************************************************************
+ *
  * IOMMU command queuing functions
  *
  ****************************************************************************/
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index da619fc..ce3130d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
 #include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/sysdev.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
 #include <asm/pci-direct.h>
 #include ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

From: Joerg Roedel <jroedel@lemmy.amd.com>

The current calculation of the IVHD entry size is hard to read. So move
this code to a seperate function to make it more clear what this
calculation does.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b876499..5303404 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -298,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
  ****************************************************************************/
 
 /*
+ * This function calculates the length of a given IVHD entry
+ */
+static inline int ivhd_entry_length(u8 *ivhd)
+{
+	return 0x04 << (*ivhd >> 6);
+}
+
+/*
  * This function reads the last device id the IOMMU has to handle from the PCI
  * capability header for this IOMMU
  */
@@ -341,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 		default:
 			break;
 		}
-		p += 0x04 << (*p >> 6);
+		p += ivhd_entry_length(p);
 	}
 
 	WARN_ON(p != end);
@@ -642,7 +650,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 			break;
 		}
 
-		p += 0x04 << (e->type >> 6);
+		p += ivhd_entry_length(p);
 	}
 }
 
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

The IO/TLB flushing on every unmaping operation is the most expensive
part there and not strictly necessary. It is sufficient to do the flush
before any entries are reused. This is patch implements lazy IO/TLB
flushing which does exactly this.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 Documentation/kernel-parameters.txt |    5 +++++
 arch/x86/kernel/amd_iommu.c         |   26 ++++++++++++++++++++++----
 arch/x86/kernel/amd_iommu_init.c    |   10 +++++++++-
 include/asm-x86/amd_iommu_types.h   |    9 +++++++++
 4 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c2e00ee..5f0aefe 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -284,6 +284,11 @@ and is between 256 and 4096 characters. It is defined in the file
 			isolate - enable device isolation (each device, as far
 			          as possible, will get its own protection
 			          domain)
+			unmap_flush - enable flushing of IO/TLB entries when
+			              they are unmapped. Otherwise they are
+				      flushed before they will be reused, which
+				      is a lot of faster
+
 	amd_iommu_size= [HW,X86-64]
 			Define the size of the aperture for the AMD IOMMU
 			driver. Possible values are:
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 691e023..0569098 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -203,6 +203,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 	return 0;
 }
 
+/* Flush the whole IO/TLB for a given protection domain */
+static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
+{
+	u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+
+	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
@@ ...
From: FUJITA Tomonori
Date: Wednesday, September 17, 2008 - 12:20 pm

On Wed, 17 Sep 2008 18:52:37 +0200

Would it be nice to have consistency of IOMMU parameters?

VT-d also has the kernel-boot option for this lazy flushing trick
though VT-d 'strict' option is more vague than 'unmap_flush'

It would be also nice to have consistency of IOMMU behavior.

VT-d enables the lazy flushing trick by default and has the boot
option to disable it.
--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 12:28 pm

True. We should merge common parameters across IOMMUs into the
iommu= parameter some time in the future, I think. It would also be the

This is exactly what AMD IOMMU with this patch does too. The
amd_iommu=unmap_flush parameter disables lazy IO/TLB flushing.


Joerg
--

From: FUJITA Tomonori
Date: Wednesday, September 17, 2008 - 6:29 pm

On Wed, 17 Sep 2008 21:28:27 +0200

Hmm, now is better than the future? I think that now you can add
something like 'disable_batching_flush' as a common parameter and

Oops, I misread the description.
--

From: Joerg Roedel
Date: Thursday, September 18, 2008 - 3:13 am

Ok, I introduce a parameter called '[no]fullflush' like the GART already uses.
We can remove this parameter from gart_parse_options() later then.

Joerg

-- 
           |           AMD Saxony Limited Liability Company & Co. KG
 Operating |         Wilschdorfer Landstr. 101, 01109 Dresden, Germany
 System    |                  Register Court Dresden: HRA 4896
 Research  |              General Partner authorized to represent:
 Center    |             AMD Saxony LLC (Wilmington, Delaware, US)
           | General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy

--

From: Joerg Roedel
Date: Thursday, September 18, 2008 - 7:03 am

Ok, I queued the following patch in the AMD IOMMU updates and changed
this patch to use iommu_fullflush instead. Is this patch ok? It changes
the behavior of GART to use lazy flushing by default. But I don't think
that this is a problem.

commit 9769771290fddcfc0362c5d30242151d4eb1cc46
Author: Joerg Roedel <joerg.roedel@amd.com>
Date:   Thu Sep 18 15:23:43 2008 +0200

    x86: move GART TLB flushing options to generic code
    
    The GART currently implements the iommu=[no]fullflush command line
    parameters which influence its IO/TLB flushing strategy. This patch
    makes these parameters generic so that they can be used by the AMD IOMMU
    too.
    
    Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c2e00ee..569527e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -888,6 +888,10 @@ and is between 256 and 4096 characters. It is defined in the file
 		nomerge
 		forcesac
 		soft
+		fullflush
+			Flush IO/TLB at every deallocation
+		nofullflush
+			Flush IO/TLB only when addresses are reused (default)
 
 
 	intel_iommu=	[DMAR] Intel IOMMU driver (DMAR) option
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 72ffb53..42c887b 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -229,8 +229,6 @@ IOMMU (input/output memory management unit)
   iommu options only relevant to the AMD GART hardware IOMMU:
     <size>             Set the size of the remapping area in bytes.
     allowed            Overwrite iommu off workarounds for specific chipsets.
-    fullflush          Flush IOMMU on each allocation (default).
-    nofullflush        Don't use IOMMU fullflush.
     leak               Turn on simple iommu leak tracing (only when
                        CONFIG_IOMMU_LEAK is on). Default number of leak pages
        ...
From: FUJITA Tomonori
Date: Thursday, September 18, 2008 - 4:10 pm

On Thu, 18 Sep 2008 16:03:50 +0200

I'm not sure about making 'nofullflush' a generic option. Enabling
nofullflush option doesn't change anything. So what's the point of the
option?
--

From: Joerg Roedel
Date: Thursday, September 18, 2008 - 11:29 pm

Backwards compatability with the GART code. These two options are
basically just moved from the GART code to pci-dma.c. But otherwise its
pointless, I can remove it if everybody else agrees.

Joerg

--

From: FUJITA Tomonori
Date: Friday, September 19, 2008 - 3:21 am

On Fri, 19 Sep 2008 08:29:46 +0200

The compatibility with the GART code doesn't mean that we need to have
this pointless option as a common option. You can make 'fullflush' a
common option and the meaningless 'nofullflush' can live in GART.
--

From: Joerg Roedel
Date: Friday, September 19, 2008 - 10:43 am

Hi All,

FUJITA mentioned that I forgot to discuss this patch with you guys, the
implementers and maintainers for Intel VT-d and Calgary IOMMU drivers. I
would like to hear your opinion on that patch. He is right with that.
The patch is already in tip/iommu but can easily be reverted if there
are fundamental objections.
The patch basically moves the iommu=fullflush and iommu=nofullflush
option from the GART code to pci-dma.c. So we can use these parameters
in other IOMMU implementations too. Since Intel VT-d is implementing
also lazy IO/TLB flushing it would benefit from this generic parameter
too. I am not so sure about Calgary, but we have other parameters for
iommu= which doesn't affect all hardware IOMMUs.

Joerg

--

From: FUJITA Tomonori
Date: Friday, September 19, 2008 - 11:40 am

On Fri, 19 Sep 2008 19:43:39 +0200

nofullflush is pointless. It doesn't change any kernel behavior. Yes,
GART has it and we can't remove it because we exported it to
users. But please don't add such pointless option to the generic
options just for GART compatibility.

fullflush might be useful as a generic option to disable lazy IOTLB
flushing. But I'm not sure that Calgary uses it. VT-d already has
'strict' option for it and we can't change it. If VT-d wants to
support both 'strict' and 'fullflush' for disabling lazy IOTLB
flushing, it would make sense. But if VT-d doesn't want, it is useful
only for GART and AMD IOMMU. If so, I don't think that it is very
useful but I'm not against adding it.
--

From: Joerg Roedel
Date: Friday, September 19, 2008 - 12:27 pm

I think maybe we can remove it completly. It doesn't change any behavior
for GART too and as an unknown option it will be silently ignored by the
command line parser. So removing this option completly will not break

I think it will be usefull for VT-d too. As Ingo stated some time ago,
it makes sense to have a common set of options independent of the
specific type of hardware IOMMU in the system. And all hardware IOMMUs
for x86 implement lazy IO/TLB flushing. Only Calgary does not allow to
disable it. So iommu=fullflush is a good candidate for a common
parameter.

Joerg

--

From: Keshavamurthy, Anil S
Date: Friday, September 19, 2008 - 11:47 am

-----Original Message-----
From: Joerg Roedel [mailto:joro@8bytes.org] 
Sent: Friday, September 19, 2008 10:44 AM
To: Raj, Ashok; Li, Shaohua; Keshavamurthy, Anil S; muli@il.ibm.com
Cc: Joerg Roedel; FUJITA Tomonori; iommu@lists.linux-foundation.org;
linux-kernel@vger.kernel.org; Ingo Molnar
Subject: Re: [PATCH 03/23] AMD IOMMU: implement lazy IO/TLB flushing

Hi All,

FUJITA mentioned that I forgot to discuss this patch with you guys, the
implementers and maintainers for Intel VT-d and Calgary IOMMU drivers. I
would like to hear your opinion on that patch. He is right with that.
The patch is already in tip/iommu but can easily be reverted if there
are fundamental objections.
The patch basically moves the iommu=fullflush and iommu=nofullflush
option from the GART code to pci-dma.c. So we can use these parameters
in other IOMMU implementations too. Since Intel VT-d is implementing
also lazy IO/TLB flushing it would benefit from this generic parameter
too. I am not so sure about Calgary, but we have other parameters for
iommu= which doesn't affect all hardware IOMMUs.

Joerg

--

From: Muli Ben-Yehuda
Date: Saturday, September 20, 2008 - 10:27 pm

Calgary can't use fullflush, but in general the more unified our IOMMU
options, the better for the users. It's a pain to have to remember
which option applies to which IOMMU implementation.

Cheers,
Muli
-- 
Workshop on I/O Virtualization (WIOV '08)
Co-located with OSDI '08, Dec 2008, San Diego, CA
http://www.usenix.org/wiov08
--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

This function determines if the AMD IOMMU implementation is responsible
for a given device. So the DMA layer can get this information from the
driver.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |   25 +++++++++++++++++++++++++
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 71e20ad..0410be1 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1205,6 +1205,30 @@ free_mem:
 }
 
 /*
+ * This function is called by the DMA layer to find out if we can handle a
+ * particular device. It is part of the dma_ops.
+ */
+static int amd_iommu_dma_supported(struct device *dev, u64 mask)
+{
+	u16 bdf;
+	struct pci_dev *pcidev;
+
+	/* No device or no PCI device */
+	if (!dev || dev->bus != &pci_bus_type)
+		return 0;
+
+	pcidev = to_pci_dev(dev);
+
+	bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
+
+	/* Out of our scope? */
+	if (bdf > amd_iommu_last_bdf)
+		return 0;
+
+	return 1;
+}
+
+/*
  * The function for pre-allocating protection domains.
  *
  * If the driver core informs the DMA layer if a driver grabs a device
@@ -1247,6 +1271,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
 	.unmap_single = unmap_single,
 	.map_sg = map_sg,
 	.unmap_sg = unmap_sg,
+	.dma_supported = amd_iommu_dma_supported,
 };
 
 /*
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

The current calculation is very complicated. This patch replaces it with
a much simpler version.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 65aed6e..e38719a 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -472,8 +472,7 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  ****************************************************************************/
 static unsigned long dma_mask_to_pages(unsigned long mask)
 {
-	return (mask >> PAGE_SHIFT) +
-		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
+	return PAGE_ALIGN(mask) >> PAGE_SHIFT;
 }
 
 /*
-- 
1.5.6.4


--

From: FUJITA Tomonori
Date: Wednesday, September 17, 2008 - 12:20 pm

On Wed, 17 Sep 2008 18:52:52 +0200

I think that you can use iommu_device_max_index() for what
dma_mask_to_pages does.
--

From: Joerg Roedel
Date: Thursday, September 18, 2008 - 12:32 am

Hmm, in theory yes. But iommu_device_max_index() returns a size in bytes
and not in pages. So I still need to do the page shift somewhere. This
eliminates somewhat the benefit in this particular case.

Joerg

-- 
           |           AMD Saxony Limited Liability Company & Co. KG
 Operating |         Wilschdorfer Landstr. 101, 01109 Dresden, Germany
 System    |                  Register Court Dresden: HRA 4896
 Research  |              General Partner authorized to represent:
 Center    |             AMD Saxony LLC (Wilmington, Delaware, US)
           | General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy

--

From: FUJITA Tomonori
Date: Thursday, September 18, 2008 - 8:57 am

On Thu, 18 Sep 2008 09:32:32 +0200

It doesn't return a size in bytes. You could use like:

limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
				dma_get_mask(dev) >> PAGE_SHIFT);


Anyway, it's not urgent at all. I'll send a patch for this after your
patchset is merged.
--

From: Joerg Roedel
Date: Thursday, September 18, 2008 - 9:39 am

Hmm, ok, makes sense if we remove this line too:

	limit = limit < size ? limit : size;


Ok, fine.

Joerg

-- 
           |           AMD Saxony Limited Liability Company & Co. KG
 Operating |         Wilschdorfer Landstr. 101, 01109 Dresden, Germany
 System    |                  Register Court Dresden: HRA 4896
 Research  |              General Partner authorized to represent:
 Center    |             AMD Saxony LLC (Wilmington, Delaware, US)
           | General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy

--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

This patch adds the pci_seg field to the amd_iommu structure and fills
it with the corresponding value from the ACPI table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c  |    1 +
 include/asm-x86/amd_iommu_types.h |    3 +++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a902eb6..aa26e37 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -677,6 +677,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	 */
 	iommu->devid = h->devid;
 	iommu->cap_ptr = h->cap_ptr;
+	iommu->pci_seg = h->pci_seg;
 	iommu->mmio_phys = h->mmio_phys;
 	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
 	if (!iommu->mmio_base)
diff --git a/include/asm-x86/amd_iommu_types.h b/include/asm-x86/amd_iommu_types.h
index eabc316..eb4d47a 100644
--- a/include/asm-x86/amd_iommu_types.h
+++ b/include/asm-x86/amd_iommu_types.h
@@ -232,6 +232,9 @@ struct amd_iommu {
 	/* capabilities of that IOMMU read from ACPI */
 	u32 cap;
 
+	/* pci domain of this IOMMU */
+	u16 pci_seg;
+
 	/* first device this IOMMU handles. read from PCI */
 	u16 first_device;
 	/* last device this IOMMU handles. read from PCI */
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

There is a bit in the device entry to suppress all IO page faults
generated by a device. This bit was set until now because there was no
event logging. Now that there is event logging this patch allows IO page
faults from devices to see them in the kernel log too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5182132..d8c1a45 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -951,7 +951,6 @@ static void init_device_table(void)
 	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
 		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
 		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
-		set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
 	}
 }
 
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

This patch adds code for polling and printing out events generated by
the AMD IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c       |   87 ++++++++++++++++++++++++++++++++++++-
 arch/x86/kernel/amd_iommu_init.c  |    1 -
 include/asm-x86/amd_iommu_types.h |   22 +++++++++
 3 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 3caef6b..71e20ad 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,9 +55,94 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
  *
  ****************************************************************************/
 
+static void iommu_print_event(void *__evt)
+{
+	u32 *event = __evt;
+	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
+	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
+	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
+
+	printk(KERN_ERR "AMD IOMMU: Event logged [");
+
+	switch (type) {
+	case EVENT_TYPE_ILL_DEV:
+		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	case EVENT_TYPE_IO_FAULT:
+		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
+		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       domid, address, flags);
+		break;
+	case EVENT_TYPE_DEV_TAB_ERR:
+		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	case EVENT_TYPE_PAGE_TAB_ERR:
+		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+		       "domain=0x%04x address=0x%016llx ...
From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

The amd_iommu_pd_alloc_bitmap is allocated with a calculated order and
freed with order 1. This is not a bug since the calculated order always
evaluates to 1, but its unclean code. So replace the 1 with the
calculation in the release path.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index d8c1a45..a307cf6 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1145,7 +1145,8 @@ out:
 	return ret;
 
 free:
-	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+		   get_order(MAX_DOMAIN_ID/8));
 
 	free_pages((unsigned long)amd_iommu_pd_table,
 		   get_order(rlookup_table_size));
-- 
1.5.6.4


--

From: Joerg Roedel
Date: Wednesday, September 17, 2008 - 9:52 am

The code to log IOMMU events is in place now. So enable event logging
with this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 07709a9..5182132 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -254,6 +254,13 @@ void __init iommu_enable(struct amd_iommu *iommu)
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
+/* Function to enable IOMMU event logging and event interrupts */
+void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+{
+	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+}
+
 /*
  * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
  * the system has one.
@@ -959,6 +966,7 @@ static void __init enable_iommus(void)
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
 		iommu_set_exclusion_range(iommu);
 		iommu_init_msi(iommu);
+		iommu_enable_event_logging(iommu);
 		iommu_enable(iommu);
 	}
 }
-- 
1.5.6.4


--

Previous thread: [GIT PULL] please pull infiniband.git by Roland Dreier on Wednesday, September 17, 2008 - 9:40 am. (1 message)

Next thread: New IOCTLs by Singaravelan Nallasellan on Wednesday, September 17, 2008 - 9:55 am. (8 messages)