From d20cfa47b7a0520f3d2e98a6fc49b28e69871d0c Mon Sep 17 00:00:00 2001
From: James Hughes <JamesH65@users.noreply.github.com>
Date: Tue, 14 Nov 2017 15:13:15 +0000
Subject: [PATCH] AXI performance monitor driver (#2222)

Uses the debugfs I/F to provide access to the AXI
bus performance monitors.

Requires the new mailbox peripheral access for access
to the VPU performance registers, system bus access
is done using direct register reads.

Signed-off-by: James Hughes <james.hughes@raspberrypi.org>

raspberrypi_axi_monitor: suppress warning

Suppress the following warning by casting the pointer to and uintptr_t
before to u32:

Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
 drivers/perf/Kconfig                   |   8 +
 drivers/perf/Makefile                  |   1 +
 drivers/perf/raspberrypi_axi_monitor.c | 637 +++++++++++++++++++++++++
 3 files changed, 646 insertions(+)
 create mode 100644 drivers/perf/raspberrypi_axi_monitor.c

--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -137,6 +137,14 @@ config ARM_DMC620_PMU
 	  Support for PMU events monitoring on the ARM DMC-620 memory
 	  controller.
 
+config RPI_AXIPERF
+        depends on ARCH_BCM2835
+        tristate "RaspberryPi AXI Performance monitors"
+        default n
+        help
+          Say y if you want to use Raspberry Pi AXI performance monitors, m if
+          you want to build it as a module.
+
 source "drivers/perf/hisilicon/Kconfig"
 
 endmenu
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) += thunderx2
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
 obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
 obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
+obj-$(CONFIG_RPI_AXIPERF) += raspberrypi_axi_monitor.o
--- /dev/null
+++ b/drivers/perf/raspberrypi_axi_monitor.c
@@ -0,0 +1,637 @@
+/*
+ * raspberrypi_axi_monitor.c
+ *
+ * Author: james.hughes@raspberrypi.org
+ *
+ * Raspberry Pi AXI performance counters.
+ *
+ * Copyright (C) 2017 Raspberry Pi Trading Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/devcoredump.h>
+#include <linux/device.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
+#include <soc/bcm2835/raspberrypi-firmware.h>
+
+#define NUM_MONITORS 2
+#define NUM_BUS_WATCHERS_PER_MONITOR 3
+
+#define SYSTEM_MONITOR 0
+#define VPU_MONITOR 1
+
+#define MAX_BUSES 16
+#define DEFAULT_SAMPLE_TIME 100
+
+#define NUM_BUS_WATCHER_RESULTS 9
+
+struct bus_watcher_data {
+	union	{
+		u32 results[NUM_BUS_WATCHER_RESULTS];
+		struct {
+			u32 atrans;
+			u32 atwait;
+			u32 amax;
+			u32 wtrans;
+			u32 wtwait;
+			u32 wmax;
+			u32 rtrans;
+			u32 rtwait;
+			u32 rmax;
+		};
+	};
+};
+
+
+struct rpi_axiperf {
+	struct platform_device *dev;
+	struct dentry *root_folder;
+
+	struct task_struct *monitor_thread;
+	struct mutex lock;
+
+	struct rpi_firmware *firmware;
+
+	/* Sample time spent on for each bus */
+	int sample_time;
+
+	/* Now storage for the per monitor settings and the resulting
+	 * performance figures
+	 */
+	struct {
+		/* Bit field of buses we want to monitor */
+		int bus_enabled;
+		/* Bit field of buses to filter by */
+		int bus_filter;
+		/* The current buses being monitored on this monitor */
+		int current_bus[NUM_BUS_WATCHERS_PER_MONITOR];
+		/* The last bus monitored on this monitor */
+		int last_monitored;
+
+		/* Set true if this mailbox must use the mailbox interface
+		 * rather than access registers directly.
+		 */
+		int use_mailbox_interface;
+
+		/* Current result values */
+		struct bus_watcher_data results[MAX_BUSES];
+
+		struct dentry *debugfs_entry;
+		void __iomem *base_address;
+
+	}  monitor[NUM_MONITORS];
+
+};
+
+static struct rpi_axiperf *state;
+
+/* Two monitors, System and VPU, each with the following register sets.
+ * Each monitor can only monitor one bus at a time, so we time share them,
+ * giving each bus 100ms (default, settable via debugfs) of time on its
+ * associated monitor
+ * Record results from the three Bus watchers per monitor and push to the sysfs
+ */
+
+/* general registers */
+const int GEN_CTRL;
+
+const int GEN_CTL_ENABLE_BIT	= BIT(0);
+const int GEN_CTL_RESET_BIT	= BIT(1);
+
+/* Bus watcher registers */
+const int BW_PITCH		= 0x40;
+
+const int BW0_CTRL		= 0x40;
+const int BW1_CTRL		= 0x80;
+const int BW2_CTRL		= 0xc0;
+
+const int BW_ATRANS_OFFSET	= 0x04;
+const int BW_ATWAIT_OFFSET	= 0x08;
+const int BW_AMAX_OFFSET	= 0x0c;
+const int BW_WTRANS_OFFSET	= 0x10;
+const int BW_WTWAIT_OFFSET	= 0x14;
+const int BW_WMAX_OFFSET	= 0x18;
+const int BW_RTRANS_OFFSET	= 0x1c;
+const int BW_RTWAIT_OFFSET	= 0x20;
+const int BW_RMAX_OFFSET	= 0x24;
+
+const int BW_CTRL_RESET_BIT	= BIT(31);
+const int BW_CTRL_ENABLE_BIT	= BIT(30);
+const int BW_CTRL_ENABLE_ID_FILTER_BIT	= BIT(29);
+const int BW_CTRL_LIMIT_HALT_BIT	= BIT(28);
+
+const int BW_CTRL_SOURCE_SHIFT	= 8;
+const int BW_CTRL_SOURCE_MASK	= GENMASK(12, 8); // 5 bits
+const int BW_CTRL_BUS_WATCH_SHIFT;
+const int BW_CTRL_BUS_WATCH_MASK = GENMASK(5, 0); // 6 bits
+const int BW_CTRL_BUS_FILTER_SHIFT = 8;
+
+const static char *bus_filter_strings[] = {
+	"",
+	"CORE0_V",
+	"ICACHE0",
+	"DCACHE0",
+	"CORE1_V",
+	"ICACHE1",
+	"DCACHE1",
+	"L2_MAIN",
+	"HOST_PORT",
+	"HOST_PORT2",
+	"HVS",
+	"ISP",
+	"VIDEO_DCT",
+	"VIDEO_SD2AXI",
+	"CAM0",
+	"CAM1",
+	"DMA0",
+	"DMA1",
+	"DMA2_VPU",
+	"JPEG",
+	"VIDEO_CME",
+	"TRANSPOSER",
+	"VIDEO_FME",
+	"CCP2TX",
+	"USB",
+	"V3D0",
+	"V3D1",
+	"V3D2",
+	"AVE",
+	"DEBUG",
+	"CPU",
+	"M30"
+};
+
+const int num_bus_filters = ARRAY_SIZE(bus_filter_strings);
+
+const static char *system_bus_string[] = {
+	"DMA_L2",
+	"TRANS",
+	"JPEG",
+	"SYSTEM_UC",
+	"DMA_UC",
+	"SYSTEM_L2",
+	"CCP2TX",
+	"MPHI_RX",
+	"MPHI_TX",
+	"HVS",
+	"H264",
+	"ISP",
+	"V3D",
+	"PERIPHERAL",
+	"CPU_UC",
+	"CPU_L2"
+};
+
+const int num_system_buses = ARRAY_SIZE(system_bus_string);
+
+const static char *vpu_bus_string[] = {
+	"VPU1_D_L2",
+	"VPU0_D_L2",
+	"VPU1_I_L2",
+	"VPU0_I_L2",
+	"SYSTEM_L2",
+	"L2_FLUSH",
+	"DMA_L2",
+	"VPU1_D_UC",
+	"VPU0_D_UC",
+	"VPU1_I_UC",
+	"VPU0_I_UC",
+	"SYSTEM_UC",
+	"L2_OUT",
+	"DMA_UC",
+	"SDRAM",
+	"L2_IN"
+};
+
+const int num_vpu_buses = ARRAY_SIZE(vpu_bus_string);
+
+const static char *monitor_name[] = {
+	"System",
+	"VPU"
+};
+
+static inline void write_reg(int monitor, int reg, u32 value)
+{
+	writel(value, state->monitor[monitor].base_address + reg);
+}
+
+static inline u32 read_reg(int monitor, u32 reg)
+{
+	return readl(state->monitor[monitor].base_address + reg);
+}
+
+static void read_bus_watcher(int monitor, int watcher, u32 *results)
+{
+	if (state->monitor[monitor].use_mailbox_interface) {
+		/* We have 9 results, plus the overheads of start address and
+		 * length So 11 u32 to define
+		 */
+		u32 tmp[11];
+		int err;
+
+		tmp[0] = (u32)(uintptr_t)(state->monitor[monitor].base_address + watcher
+				+ BW_ATRANS_OFFSET);
+		tmp[1] = NUM_BUS_WATCHER_RESULTS;
+
+		err = rpi_firmware_property(state->firmware,
+					    RPI_FIRMWARE_GET_PERIPH_REG,
+					    tmp, sizeof(tmp));
+
+		if (err < 0 || tmp[1] != NUM_BUS_WATCHER_RESULTS)
+			dev_err_once(&state->dev->dev,
+				     "Failed to read bus watcher");
+		else
+			memcpy(results, &tmp[2],
+			       NUM_BUS_WATCHER_RESULTS * sizeof(u32));
+	} else {
+		int i;
+		void __iomem *addr = state->monitor[monitor].base_address
+				+ watcher + BW_ATRANS_OFFSET;
+		for (i = 0; i < NUM_BUS_WATCHER_RESULTS; i++, addr += 4)
+			*results++ = readl(addr);
+	}
+}
+
+static void set_monitor_control(int monitor, u32 set)
+{
+	if (state->monitor[monitor].use_mailbox_interface) {
+		u32 tmp[3] = {(u32)(uintptr_t)(state->monitor[monitor].base_address +
+				GEN_CTRL), 1, set};
+		int err = rpi_firmware_property(state->firmware,
+						RPI_FIRMWARE_SET_PERIPH_REG,
+						tmp, sizeof(tmp));
+
+		if (err < 0 || tmp[1] != 1)
+			dev_err_once(&state->dev->dev,
+				"Failed to set monitor control");
+	} else
+		write_reg(monitor, GEN_CTRL, set);
+}
+
+static void set_bus_watcher_control(int monitor, int watcher, u32 set)
+{
+	if (state->monitor[monitor].use_mailbox_interface) {
+		u32 tmp[3] = {(u32)(uintptr_t)(state->monitor[monitor].base_address +
+				    watcher), 1, set};
+		int err = rpi_firmware_property(state->firmware,
+						RPI_FIRMWARE_SET_PERIPH_REG,
+						tmp, sizeof(tmp));
+		if (err < 0 || tmp[1] != 1)
+			dev_err_once(&state->dev->dev,
+				"Failed to set bus watcher control");
+	} else
+		write_reg(monitor, watcher, set);
+}
+
+static void monitor(struct rpi_axiperf *state)
+{
+	int monitor, num_buses[NUM_MONITORS];
+
+	mutex_lock(&state->lock);
+
+	for (monitor = 0; monitor < NUM_MONITORS; monitor++) {
+		typeof(state->monitor[0]) *mon = &(state->monitor[monitor]);
+
+		/* Anything enabled? */
+		if (mon->bus_enabled == 0) {
+			/* No, disable all monitoring for this monitor */
+			set_monitor_control(monitor, GEN_CTL_RESET_BIT);
+		} else {
+			int i;
+
+			/* Find out how many busses we want to monitor, and
+			 * spread our 3 actual monitors over them
+			 */
+			num_buses[monitor] = hweight32(mon->bus_enabled);
+			num_buses[monitor] = min(num_buses[monitor],
+						 NUM_BUS_WATCHERS_PER_MONITOR);
+
+			for (i = 0; i < num_buses[monitor]; i++) {
+				int bus_control;
+
+				do {
+					mon->last_monitored++;
+					mon->last_monitored &= 0xf;
+				} while ((mon->bus_enabled &
+					 (1 << mon->last_monitored)) == 0);
+
+				mon->current_bus[i] = mon->last_monitored;
+
+				/* Reset the counters */
+				set_bus_watcher_control(monitor,
+							BW0_CTRL +
+							i*BW_PITCH,
+							BW_CTRL_RESET_BIT);
+
+				bus_control = BW_CTRL_ENABLE_BIT |
+						mon->current_bus[i];
+
+				if (mon->bus_filter) {
+					bus_control |=
+						BW_CTRL_ENABLE_ID_FILTER_BIT;
+					bus_control |=
+						((mon->bus_filter & 0x1f)
+						<< BW_CTRL_BUS_FILTER_SHIFT);
+				}
+
+				// Start capture
+				set_bus_watcher_control(monitor,
+							BW0_CTRL + i*BW_PITCH,
+							bus_control);
+			}
+		}
+
+		/* start monitoring */
+		set_monitor_control(monitor, GEN_CTL_ENABLE_BIT);
+	}
+
+	mutex_unlock(&state->lock);
+
+	msleep(state->sample_time);
+
+	/* Now read the results */
+
+	mutex_lock(&state->lock);
+	for (monitor = 0; monitor < NUM_MONITORS; monitor++) {
+		typeof(state->monitor[0]) *mon = &(state->monitor[monitor]);
+
+		/* Anything enabled? */
+		if (mon->bus_enabled == 0) {
+			/* No, disable all monitoring for this monitor */
+			set_monitor_control(monitor, 0);
+		} else {
+			int i;
+
+			for (i = 0; i < num_buses[monitor]; i++) {
+				int bus = mon->current_bus[i];
+
+				read_bus_watcher(monitor,
+					BW0_CTRL + i*BW_PITCH,
+					(u32 *)&mon->results[bus].results);
+			}
+		}
+	}
+	mutex_unlock(&state->lock);
+}
+
+static int monitor_thread(void *data)
+{
+	struct rpi_axiperf *state  = data;
+
+	while (1) {
+		monitor(state);
+
+		if (kthread_should_stop())
+			return 0;
+	}
+	return 0;
+}
+
+static ssize_t myreader(struct file *fp, char __user *user_buffer,
+			size_t count, loff_t *position)
+{
+#define INIT_BUFF_SIZE 2048
+
+	int i;
+	int idx = (int)(uintptr_t)(fp->private_data);
+	int num_buses, cnt;
+	char *string_buffer;
+	int buff_size = INIT_BUFF_SIZE;
+	char *p;
+	typeof(state->monitor[0]) *mon = &(state->monitor[idx]);
+
+	if (idx < 0 || idx > NUM_MONITORS)
+		idx = 0;
+
+	num_buses = idx == SYSTEM_MONITOR ? num_system_buses : num_vpu_buses;
+
+	string_buffer = kmalloc(buff_size, GFP_KERNEL);
+
+	if (!string_buffer) {
+		dev_err(&state->dev->dev,
+				"Failed temporary string allocation\n");
+		return 0;
+	}
+
+	p = string_buffer;
+
+	mutex_lock(&state->lock);
+
+	if (mon->bus_filter) {
+		int filt = min(mon->bus_filter & 0x1f, num_bus_filters);
+
+		cnt = snprintf(p, buff_size,
+			       "\nMonitoring transactions from %s only\n",
+			       bus_filter_strings[filt]);
+		p += cnt;
+		buff_size -= cnt;
+	}
+
+	cnt = snprintf(p, buff_size, "     Bus   |    Atrans    Atwait      AMax    Wtrans    Wtwait      WMax    Rtrans    Rtwait      RMax\n"
+				     "======================================================================================================\n");
+
+	if (cnt >= buff_size)
+		goto done;
+
+	p += cnt;
+	buff_size -= cnt;
+
+	for (i = 0; i < num_buses; i++) {
+		if (mon->bus_enabled & (1 << i)) {
+#define DIVIDER (1024)
+			typeof(mon->results[0]) *res = &(mon->results[i]);
+
+			cnt = snprintf(p, buff_size,
+					"%10s | %8uK %8uK %8uK %8uK %8uK %8uK %8uK %8uK %8uK\n",
+					idx == SYSTEM_MONITOR ?
+						system_bus_string[i] :
+						vpu_bus_string[i],
+					res->atrans/DIVIDER,
+					res->atwait/DIVIDER,
+					res->amax/DIVIDER,
+					res->wtrans/DIVIDER,
+					res->wtwait/DIVIDER,
+					res->wmax/DIVIDER,
+					res->rtrans/DIVIDER,
+					res->rtwait/DIVIDER,
+					res->rmax/DIVIDER
+					);
+			if (cnt >= buff_size)
+				goto done;
+
+			p += cnt;
+			buff_size -= cnt;
+		}
+	}
+
+	mutex_unlock(&state->lock);
+
+done:
+
+	/* did the last string entry exceeed our buffer size? ie out of string
+	 * buffer space. Null terminate, use what we have.
+	 */
+	if (cnt >= buff_size) {
+		buff_size = 0;
+		string_buffer[INIT_BUFF_SIZE] = 0;
+	}
+
+	cnt = simple_read_from_buffer(user_buffer, count, position,
+				      string_buffer,
+				      INIT_BUFF_SIZE - buff_size);
+
+	kfree(string_buffer);
+
+	return cnt;
+}
+
+static ssize_t mywriter(struct file *fp, const char __user *user_buffer,
+			size_t count, loff_t *position)
+{
+	int idx = (int)(uintptr_t)(fp->private_data);
+
+	if (idx < 0 || idx > NUM_MONITORS)
+		idx = 0;
+
+	/* At the moment, this does nothing, but in the future it could be
+	 * used to reset counters etc
+	 */
+	return count;
+}
+
+static const struct file_operations fops_debug = {
+	.read = myreader,
+	.write = mywriter,
+	.open = simple_open
+};
+
+static int rpi_axiperf_probe(struct platform_device *pdev)
+{
+	int ret = 0, i;
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+	struct device_node *fw_node;
+
+	state = kzalloc(sizeof(struct rpi_axiperf), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	/* Get the firmware handle for future rpi-firmware-xxx calls */
+	fw_node = of_parse_phandle(np, "firmware", 0);
+	if (!fw_node) {
+		dev_err(dev, "Missing firmware node\n");
+		return -ENOENT;
+	}
+
+	state->firmware = rpi_firmware_get(fw_node);
+	if (!state->firmware)
+		return -EPROBE_DEFER;
+
+	/* Special case for the VPU monitor, we must use the mailbox interface
+	 * as it is not accessible from the ARM address space.
+	 */
+	state->monitor[VPU_MONITOR].use_mailbox_interface = 1;
+	state->monitor[SYSTEM_MONITOR].use_mailbox_interface = 0;
+
+	for (i = 0; i < NUM_MONITORS; i++) {
+		if (state->monitor[i].use_mailbox_interface) {
+			 of_property_read_u32_index(np, "reg", i*2,
+				(u32 *)(&state->monitor[i].base_address));
+		} else {
+			struct resource *resource =
+				platform_get_resource(pdev, IORESOURCE_MEM, i);
+
+			state->monitor[i].base_address =
+				devm_ioremap_resource(&pdev->dev, resource);
+		}
+
+		if (IS_ERR(state->monitor[i].base_address))
+			return PTR_ERR(state->monitor[i].base_address);
+
+		/* Enable all buses by default */
+		state->monitor[i].bus_enabled = 0xffff;
+	}
+
+	state->dev = pdev;
+	platform_set_drvdata(pdev, state);
+
+	state->sample_time = DEFAULT_SAMPLE_TIME;
+
+	/* Set up all the debugfs stuff */
+	state->root_folder = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+	for (i = 0; i < NUM_MONITORS; i++) {
+		state->monitor[i].debugfs_entry =
+			debugfs_create_dir(monitor_name[i], state->root_folder);
+		if (IS_ERR(state->monitor[i].debugfs_entry))
+			state->monitor[i].debugfs_entry = NULL;
+
+		debugfs_create_file("data", 0444,
+				    state->monitor[i].debugfs_entry,
+				    (void *)(uintptr_t)i, &fops_debug);
+		debugfs_create_u32("enable", 0644,
+				   state->monitor[i].debugfs_entry,
+				   &state->monitor[i].bus_enabled);
+		debugfs_create_u32("filter", 0644,
+				   state->monitor[i].debugfs_entry,
+				   &state->monitor[i].bus_filter);
+		debugfs_create_u32("sample_time", 0644,
+				   state->monitor[i].debugfs_entry,
+				   &state->sample_time);
+	}
+
+	mutex_init(&state->lock);
+
+	state->monitor_thread = kthread_run(monitor_thread, state,
+					    "rpi-axiperfmon");
+
+	return ret;
+
+}
+
+static int rpi_axiperf_remove(struct platform_device *dev)
+{
+	int ret = 0;
+
+	kthread_stop(state->monitor_thread);
+
+	debugfs_remove_recursive(state->root_folder);
+	state->root_folder = NULL;
+
+	return ret;
+}
+
+static const struct of_device_id rpi_axiperf_match[] = {
+	{
+		.compatible = "brcm,bcm2835-axiperf",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(of, rpi_axiperf_match);
+
+static struct platform_driver rpi_axiperf_driver  = {
+	.probe =	rpi_axiperf_probe,
+	.remove =	rpi_axiperf_remove,
+	.driver = {
+		.name   = "rpi-bcm2835-axiperf",
+		.of_match_table = of_match_ptr(rpi_axiperf_match),
+	},
+};
+
+module_platform_driver(rpi_axiperf_driver);
+
+/* Module information */
+MODULE_AUTHOR("James Hughes <james.hughes@raspberrypi.org>");
+MODULE_DESCRIPTION("RPI AXI Performance monitor driver");
+MODULE_LICENSE("GPL");
+