MENU

EB5 PCIe/有线网迁移测试记录(主线 6.19)

March 9, 2026 • Read: 12 • Linux,Qualcomm,Hardware

环境

  • 目标板:QRB5165 EB5
  • 基线:qrb5165-eb5.dts 继承 qrb5165-rb5.dts
  • 目标:恢复 pcie1 下有线网卡 endpoint 枚举

主线 dts

// SPDX-License-Identifier: BSD-3-Clause
/*
 * Qualcomm Robotics EB5 platform based on QRB5165/RB5
 */

/dts-v1/;

#include "qrb5165-rb5.dts"

/*
 * Board identity overrides for EB5, layered on top of the included
 * qrb5165-rb5.dts.
 */
/ {
    model = "Qualcomm Technologies, Inc. qrb5165 IOT EB5";
    compatible = "qcom,kona-iot", "qcom,kona", "qcom,iot",
             "qcom,eb5", "qcom,qrb5165-rb5", "qcom,sm8250";

    /*
     * NOTE(review): pcie2 is disabled further down in this file but still
     * has a pci-domain alias here — confirm whether the alias is needed.
     */
    aliases {
        ufshc1 = &ufs_mem_hc;
        pci-domain0 = &pcie0;
        pci-domain1 = &pcie1;
        pci-domain2 = &pcie2;
    };
};

/*
 * pcie1 (RC1) hosts the wired-NIC path on EB5: link parameters, supplies,
 * the iommu-map removal and the extra "lan-en" ordering clock are all set
 * here.
 */
&pcie1 {
    status = "okay";
    max-link-speed = <3>;
    num-lanes = <2>;
    wake-gpios = <&tlmm 84 GPIO_ACTIVE_HIGH>;
    vdda-supply = <&vreg_l9a_1p2>;
    vddpe-3v3-supply = <&vreg_l11c_3p3>;
    /*
     * Remove iommu-map so qcom_pcie_config_sid_1_9_0() returns early
     * and never clears BDF_TO_SID_BYPASS.  With bypass ON (hardware
     * default), inbound PCIe completions skip SMMU BDF-to-SID lookup.
     *
     * Background: during PCI bus walk the SMMU has no context banks for
     * downstream-device SIDs yet (iommu_attach_device() is called only
     * after device_add(), i.e. AFTER the first config read).  The vendor
     * msm_pcie driver never touches BDF_TO_SID_CFG, so bypass stays ON
     * there and ASM2806 completions flow through unblocked.
     */
    /delete-property/ iommu-map;
    /*
     * Append the helper as an extra clock entry so fw_devlink ensures
     * the helper probes (and gpio141 is driven) before qcom-pcie probes.
     * qcom-pcie only requests clocks by the names it knows; "lan-en" is
     * silently ignored by the driver but honored by fw_devlink.
     *
     * NOTE(review): newer qcom-pcie code may fetch every listed clock
     * (devm_clk_bulk_get_all()) instead of a fixed name list, in which
     * case "lan-en" is acquired and enabled rather than ignored.  The
     * helper's clock has no ops, so this should be harmless — confirm
     * against the kernel version actually built.
     */
    clocks = <&gcc GCC_PCIE_1_PIPE_CLK>,
         <&gcc GCC_PCIE_1_AUX_CLK>,
         <&gcc GCC_PCIE_1_CFG_AHB_CLK>,
         <&gcc GCC_PCIE_1_MSTR_AXI_CLK>,
         <&gcc GCC_PCIE_1_SLV_AXI_CLK>,
         <&gcc GCC_PCIE_1_SLV_Q2A_AXI_CLK>,
         <&gcc GCC_PCIE_WIGIG_CLKREF_EN>,
         <&gcc GCC_AGGRE_NOC_PCIE_TBU_CLK>,
         <&gcc GCC_DDRSS_PCIE_SF_TBU_CLK>,
         <&eb5_pcie1_helper>;
    clock-names = "pipe", "aux", "cfg", "bus_master", "bus_slave",
              "slave_q2a", "ref", "tbu", "ddrss_sf_tbu", "lan-en";
    /* Only the default pcie1 pin state; gpio141 is owned by the helper. */
    pinctrl-0 = <&pcie1_default_state>;

    /*
     * Root-port child node, left empty in this overlay.
     * NOTE(review): presumably inherits reset-gpios from the base tree —
     * confirm.
     */
    pcie@0 {
    };
};

/*
 * Anchor node for the eb5-pcie1-helper driver (pcie-qcom-eb5-helper.c).
 *
 * The driver bound to this node:
 *   - takes lan-en-gpios (gpio141) and drives it high at probe;
 *   - registers a dummy clock (#clock-cells = <0>) that &pcie1 lists as
 *     "lan-en", so pcie1 is ordered after this node;
 *   - waits for the pcie1 root bus (domain 1) and rescans it so the
 *     ASM2806 cascade + RTL8168 endpoints can be discovered without any
 *     modifications to qcom.c.
 */
/ {
    eb5_pcie1_helper: eb5-pcie1-helper {
        compatible = "qcom,eb5-pcie1-helper";
        lan-en-gpios = <&tlmm 141 GPIO_ACTIVE_HIGH>;
        #clock-cells = <0>;
        status = "okay";
    };
};

/* pcie2 and its PHY are switched off on EB5 to rule out conflicts with pcie1. */
&pcie2 {
    status = "disabled";
};

&pcie2_phy {
    status = "disabled";
};

/* Intentionally empty: no board-specific pin states are added to tlmm here. */
&tlmm {
};

EB5 Helper Module

// SPDX-License-Identifier: GPL-2.0
/*
 * pcie-qcom-eb5-helper.c - PCIe1 ASM2806 bridge bring-up helper for QRB5165 EB5
 *
 * The Qualcomm vendor 4.19 kernel has a private "use-pcie-bridge-asm2806" DT
 * property that drives gpio141 (ASM2806 bridge-enable) before the RC1 PCIe
 * controller enumerates the bus.  Mainline qcom-pcie has no such logic.
 *
 * This driver:
 *   1. Acquires gpio141 and drives it HIGH on probe (before qcom-pcie touches
 *      PERST#).
 *   2. Registers itself as a dummy fixed-rate clock provider (#clock-cells = <0>),
 *      which qrb5165-eb5.dts adds to pcie1's clock list.  fw_devlink sees this
 *      phandle and guarantees that pcie1 will NOT be probed until this driver's
 *      probe() returns successfully.
 *   3. Schedules a deferred rescan after the ASM2806 cascade downstream links
 *      finish training.  The rescan explicitly programs bridge MEMORY_BASE/LIMIT
 *      registers to hardware — pci_assign_unassigned_bus_resources() only updates
 *      kernel data structures but does NOT write the bridge window registers,
 *      which would leave the RTL8168 endpoints inaccessible.
 *
 * Result: gpio141 is driven high BEFORE qcom_pcie_host_init() de-asserts PERST#,
 * so the ASM2806 bridge is powered and ready for config-space enumeration.
 *
 * Nothing in qcom.c / pcie-qcom.c is modified.
 */

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/delay.h>
#include <linux/gpio/consumer.h>
#include <linux/clk-provider.h>
#include <linux/clk.h>
#include <linux/pci.h>
#include <linux/workqueue.h>
#include <linux/io.h>

/* PCI domain 1, root bus 0 — matches pcie1 (1c08000.pcie) */
#define EB5_PCIE1_DOMAIN    1
#define EB5_PCIE1_ROOT_BUS    0

/*
 * Poll interval while waiting for pcie1 to create its root bus.
 * pcie1 probe can take 10-15 s at boot; we retry up to 30 times (30 s).
 * Total wait = EB5_POLL_INTERVAL_MS * EB5_POLL_MAX_RETRIES.
 */
#define EB5_POLL_INTERVAL_MS    1000
#define EB5_POLL_MAX_RETRIES    30

/* Used as the platform driver name, the dummy clock name and the log prefix. */
#define DRV_NAME "qcom-eb5-pcie1-helper"

/*
 * struct eb5_pcie_helper - per-device state of the EB5 pcie1 helper.
 * @lan_en_gpio: "lan-en" GPIO descriptor (gpio141 in the EB5 DTS), requested
 *               output-high in probe.
 * @clk_hw:      dummy clock registered only so pcie1 can list this device in
 *               its 'clocks' property.
 * @rescan_work: delayed work that waits for the pcie1 root bus and rescans it.
 */
struct eb5_pcie_helper {
    struct gpio_desc    *lan_en_gpio;
    struct clk_hw        clk_hw;
    struct delayed_work  rescan_work;
};

/*
 * Empty ops table: the clock has no enable/disable/rate callbacks and exists
 * purely as a dependency anchor for pcie1.
 */
static const struct clk_ops eb5_lan_clk_ops = { /* no-op clock, ordering only */ };

/*
 * program_bridge_windows - write bridge memory windows to hardware registers.
 *
 * pci_assign_unassigned_bus_resources() assigns memory windows in the kernel's
 * resource tree but does NOT write PCI_MEMORY_BASE / PCI_MEMORY_LIMIT to the
 * bridge's config space.  Without this step the bridge does not forward memory
 * transactions downstream, so endpoint drivers fail on their very first MMIO
 * access.  Recurse into child buses so all levels of ASM2806 are programmed.
 */
static void program_bridge_windows(struct pci_bus *bus)
{
    struct pci_dev *dev;

    list_for_each_entry(dev, &bus->devices, bus_list) {
        struct resource *res;
        u16 cmd;

        if (!dev->subordinate)
            continue;

        /* Write 32-bit non-prefetchable memory window */
        res = &dev->resource[PCI_BRIDGE_MEM_WINDOW];
        if (resource_size(res) > 0) {
            u16 mem_base  = (res->start >> 16) & 0xfff0;
            u16 mem_limit = (res->end   >> 16) & 0xfff0;

            pci_write_config_word(dev, PCI_MEMORY_BASE,  mem_base);
            pci_write_config_word(dev, PCI_MEMORY_LIMIT, mem_limit);
            dev_info(&dev->dev, "bridge mem window programmed: %pR\n", res);
        }

        /* Write I/O window if present */
        res = &dev->resource[PCI_BRIDGE_IO_WINDOW];
        if (resource_size(res) > 0) {
            pci_write_config_byte(dev, PCI_IO_BASE,
                          (res->start >> 8) & 0xf0);
            pci_write_config_byte(dev, PCI_IO_LIMIT,
                          (res->end   >> 8) & 0xf0);
        }

        /* Enable bus-mastering and memory-space decoding on the bridge */
        pci_read_config_word(dev, PCI_COMMAND, &cmd);
        cmd |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
        pci_write_config_word(dev, PCI_COMMAND, cmd);

        program_bridge_windows(dev->subordinate);
    }
}

static void eb5_pcie1_rescan_work(struct work_struct *work)
{
    struct eb5_pcie_helper *h =
        container_of(work, struct eb5_pcie_helper, rescan_work.work);
    struct pci_bus *root_bus;
    int retries = 0;

    /* Re-assert gpio141 in case it was reset during suspend/resume */
    gpiod_set_value_cansleep(h->lan_en_gpio, 1);

    /*
     * pcie1 probe (and bus creation) can take 10-15 s at boot — much
     * longer than the helper's own probe.  Poll until the root bus
     * appears, then give the ASM2806 downstream links an extra second
     * to finish training before we scan.
     */
    while (retries < EB5_POLL_MAX_RETRIES) {
        root_bus = pci_find_bus(EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS);
        if (root_bus)
            break;
        pr_debug(DRV_NAME ": waiting for pcie1 bus (attempt %d/%d)\n",
             retries + 1, EB5_POLL_MAX_RETRIES);
        msleep(EB5_POLL_INTERVAL_MS);
        retries++;
    }

    if (!root_bus) {
        pr_err(DRV_NAME ": domain %u bus %02x not found after %d s, giving up\n",
               EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS, EB5_POLL_MAX_RETRIES);
        return;
    }

    /* Extra settling time for ASM2806 downstream link training */
    msleep(1000);

    pr_info(DRV_NAME ": rescanning pcie1 (domain %u bus %02x) after %d poll(s)\n",
        EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS, retries);

    /*
     * Issue a Secondary Bus Reset (SBR) via the root port's Bridge Control
     * register.  This pulses the downstream PERST# from the RC side without
     * needing direct GPIO access.  The ASM2806 may have been unresponsive
     * during the initial enumeration because its internal init was not yet
     * complete when qcom-pcie first de-asserted PERST# at probe time.
     * Driving SBR here — after gpio141 has been high for several seconds —
     * gives the ASM2806 a clean reset cycle with power already stable.
     */
    {
        struct pci_dev *rp = pci_get_domain_bus_and_slot(
                    EB5_PCIE1_DOMAIN, 0, PCI_DEVFN(0, 0));
        if (rp) {
            u16 bctl;

            pci_read_config_word(rp, PCI_BRIDGE_CONTROL, &bctl);
            /* Assert Secondary Bus Reset */
            pci_write_config_word(rp, PCI_BRIDGE_CONTROL,
                          bctl | PCI_BRIDGE_CTL_BUS_RESET);
            msleep(100); /* hold reset ≥ 100 ms (PCIe r3.0 §6.6.1) */
            /* De-assert Secondary Bus Reset */
            pci_write_config_word(rp, PCI_BRIDGE_CONTROL, bctl);
            msleep(500); /* wait for ASM2806 to finish link re-training */
            dev_info(&rp->dev,
                 DRV_NAME ": SBR pulse done, waiting for ASM2806\n");
            pci_dev_put(rp);
        } else {
            pr_warn(DRV_NAME ": root port not found, skipping SBR\n");
        }
    }

    pci_lock_rescan_remove();

    /* Step 1: discover new devices (ASM2806 cascade + RTL8168) */
    pci_scan_child_bus(root_bus);

    /* Step 2: assign BARs and bridge windows in kernel resource structs */
    pci_assign_unassigned_bus_resources(root_bus);

    /*
     * Step 3: write bridge MEMORY_BASE/LIMIT to hardware config space.
     * This is the step that pci_rescan_bus() / pci_assign_…() omit,
     * and without it the RTL8168 endpoints are unreachable via MMIO.
     */
    program_bridge_windows(root_bus);

    /* Step 4: add devices to driver model — triggers driver probes */
    pci_bus_add_devices(root_bus);

    pci_unlock_rescan_remove();

    pr_info(DRV_NAME ": rescan complete, bridges programmed\n");
}

/* Forward declaration for sysfs attribute */
static DEVICE_ATTR_WO(reset_asm2806);

/*
 * eb5_pcie_helper_probe - assert lan-en, register the ordering clock and
 * start the pcie1 rescan worker.
 * @pdev: platform device matched against "qcom,eb5-pcie1-helper".
 *
 * Return: 0 on success, -ENOMEM if the state allocation fails, or the error
 * returned by the GPIO / clock registration calls.  Failure to create the
 * reset_asm2806 sysfs file is only warned about and does not fail probe.
 */
static int eb5_pcie_helper_probe(struct platform_device *pdev)
{
    struct device *dev = &pdev->dev;
    struct eb5_pcie_helper *h;
    struct clk_init_data init = {};
    int ret;

    h = devm_kzalloc(dev, sizeof(*h), GFP_KERNEL);
    if (!h)
        return -ENOMEM;

    /*
     * Acquire gpio141 (ASM2806 bridge-enable) and drive it HIGH.
     * This must happen before qcom-pcie de-asserts PERST# on RC1.
     * The clock provider registration below is what keeps qcom-pcie
     * from probing before this point.
     */
    h->lan_en_gpio = devm_gpiod_get(dev, "lan-en", GPIOD_OUT_HIGH);
    if (IS_ERR(h->lan_en_gpio))
        return dev_err_probe(dev, PTR_ERR(h->lan_en_gpio),
                 "failed to get lan-en gpio\n");

    msleep(50); /* allow ASM2806 to become ready */
    dev_info(dev, "lan-en (gpio141) asserted high\n");

    /*
     * Register a dummy clock with no ops so that fw_devlink can enforce
     * the probe ordering: pcie1 (which lists us in its 'clocks') will
     * not probe before the provider is registered.
     */
    init.name = DRV_NAME;
    init.ops  = &eb5_lan_clk_ops;
    h->clk_hw.init = &init;

    ret = devm_clk_hw_register(dev, &h->clk_hw);
    if (ret)
        return dev_err_probe(dev, ret, "clk_hw_register failed\n");

    ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, &h->clk_hw);
    if (ret)
        return dev_err_probe(dev, ret,
                 "of_clk_add_hw_provider failed\n");

    /* Publish the state before anything that can reach it is started */
    INIT_DELAYED_WORK(&h->rescan_work, eb5_pcie1_rescan_work);
    platform_set_drvdata(pdev, h);

    /* Start polling immediately; the work itself waits for the bus */
    schedule_delayed_work(&h->rescan_work, 0);

    /* device_create_file() takes the device_attribute, not its .attr member */
    ret = device_create_file(dev, &dev_attr_reset_asm2806);
    if (ret)
        dev_warn(dev, "failed to create reset_asm2806 sysfs: %d\n", ret);

    dev_info(dev, "clock provider registered, waiting for pcie1 bus\n");
    return 0;
}

static void eb5_pcie_helper_remove(struct platform_device *pdev)
{
    struct eb5_pcie_helper *h = platform_get_drvdata(pdev);

    cancel_delayed_work_sync(&h->rescan_work);
    device_remove_file(&pwd->dev, &dev_attr_reset_asm2806.attr);
}

/* Device-tree match table: binds to the eb5-pcie1-helper anchor node. */
static const struct of_device_id eb5_pcie_helper_of_match[] = {
    { .compatible = "qcom,eb5-pcie1-helper" },
    {}
};
MODULE_DEVICE_TABLE(of, eb5_pcie_helper_of_match);

/*
 * Platform driver glue.  Registered with builtin_platform_driver(), i.e. the
 * driver is meant to be built into the kernel image rather than loaded later.
 */
static struct platform_driver eb5_pcie_helper_driver = {
    .probe          = eb5_pcie_helper_probe,
    .remove         = eb5_pcie_helper_remove,
    .driver = {
        .name           = DRV_NAME,
        .of_match_table = eb5_pcie_helper_of_match,
    },
};
builtin_platform_driver(eb5_pcie_helper_driver);

/*
 * Physical address of pcie1's PARF_BDF_TO_SID_CFG register: the pcie1 block
 * at 0x1c08000 plus the 0x2c00 register offset.  Bit 0 is the bypass bit.
 */
#define EB5_PCIE1_BDF_TO_SID_CFG_PA    0x1c0ac00
#define EB5_BDF_TO_SID_BYPASS        BIT(0)

/*
 * reset_asm2806_store - sysfs trigger for a full ASM2806 reset + rescan.
 * @dev:   the helper platform device.
 * @attr:  unused.
 * @buf:   unused; any write triggers the sequence.
 * @count: number of bytes written, returned on success.
 *
 * Sequence:
 *   1. Power-cycle lan-en (gpio141): low 200 ms, high, wait 500 ms.
 *   2. Set the BDF_TO_SID bypass bit (read-modify-write, other bits kept).
 *   3. Pulse Secondary Bus Reset on the pcie1 root port.
 *   4. Rescan pcie1 with the same steps the boot-time worker uses: scan,
 *      assign resources, program bridge windows, add devices.
 *
 * Return: @count on success, -ENODEV if the pcie1 root bus does not exist.
 */
static ssize_t reset_asm2806_store(struct device *dev,
                    struct device_attribute *attr,
                    const char *buf, size_t count)
{
    struct eb5_pcie_helper *h = dev_get_drvdata(dev);
    void __iomem *parf;
    struct pci_bus *root_bus;
    struct pci_dev *rp;

    dev_info(dev, "triggering full ASM2806 reset sequence...\n");

    /* 1. Power cycle gpio141 */
    gpiod_set_value_cansleep(h->lan_en_gpio, 0);
    msleep(200);
    gpiod_set_value_cansleep(h->lan_en_gpio, 1);
    msleep(500);

    /* 2. Set the bypass bit without clobbering the rest of the register */
    parf = ioremap(EB5_PCIE1_BDF_TO_SID_CFG_PA, 4);
    if (parf) {
        u32 val = readl(parf);

        writel(val | EB5_BDF_TO_SID_BYPASS, parf);
        dev_info(dev, "BDF_TO_SID_BYPASS set\n");
        iounmap(parf);
    } else {
        dev_warn(dev, "ioremap of BDF_TO_SID_CFG failed, bypass not set\n");
    }

    /* Without a root bus there is nothing to reset or rescan */
    root_bus = pci_find_bus(EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS);
    if (!root_bus) {
        dev_err(dev, "pcie1 root bus not found, cannot rescan\n");
        return -ENODEV;
    }

    /* 3. Trigger SBR */
    rp = pci_get_domain_bus_and_slot(EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS,
                     PCI_DEVFN(0, 0));
    if (rp) {
        u16 bctl;

        pci_read_config_word(rp, PCI_BRIDGE_CONTROL, &bctl);
        pci_write_config_word(rp, PCI_BRIDGE_CONTROL,
                      bctl | PCI_BRIDGE_CTL_BUS_RESET);
        msleep(100);
        pci_write_config_word(rp, PCI_BRIDGE_CONTROL, bctl);
        msleep(500);
        pci_dev_put(rp);
        dev_info(dev, "SBR done\n");
    } else {
        dev_warn(dev, "root port not found, skipping SBR\n");
    }

    /* 4. Rescan, mirroring the boot-time worker's steps */
    pci_lock_rescan_remove();
    pci_scan_child_bus(root_bus);
    pci_assign_unassigned_bus_resources(root_bus);
    program_bridge_windows(root_bus);
    pci_bus_add_devices(root_bus);
    pci_unlock_rescan_remove();

    dev_info(dev, "reset sequence complete\n");
    return count;
}

已执行测试(按大类)

1) DTS 基线与 include 链确认

  • 确认 qrb5165-eb5.dts -> qrb5165-rb5.dts -> sm8250.dtsi 链路正确。
  • 校对 pcie1 主线默认脚位语义:perst=82、wake=84。

2) pcie1 链路参数测试

  • &pcie1 设置:max-link-speed = <1>(降速到 Gen1)。
  • &pcie1 设置:num-lanes = <1>(单 lane 试验)。
  • 结果:0001 域仍仅有 root port,无桥后 endpoint。

3) IOMMU 映射扩展测试

  • iommu-map 从基础项扩展到多 BDF(覆盖 0x0..0x600 相关条目)。
  • 结果:未触发有线 endpoint 出现。

4) 供电影响排查

  • 增加/确认 vdda-supply、vddpe-3v3-supply。
  • 结果:无显著变化,pcie1 仍空下游。

5) WAKE/RESET 脚位组合测试

  • wake-gpios 做过候选切换(22/84),最终回归 84(与主线/旧树一致)。
  • pcie@0/reset-gpios 做过极性 A/B 试验(active low/high 组合对比)。
  • 结果:仍未枚举到有线 endpoint。

6) 141 板级控制线测试

  • 早期方案:gpio141 作为 gpio-hog output-high(常高驱动)。
  • 最新方案:移除 hog,改为 pcie1 附加 pinctrl(pcie1_lan1_wake_default,bias-pull-up 输入态),贴近旧树语义。
  • 结果:等待该版本实机回传确认。

7) pcie2 冲突隔离测试

  • &pcie2、&pcie2_phy 设为 disabled。
  • 结果:未改变 pcie1 无下游端点现象。

8) GPIO 共享告警处置

  • 观察到告警:gpio_shared_add_proxy_lookup / qcom_pcie_probe
  • 做过修复:删除 &pcie1 的 perst-gpios,保留 pcie@0/reset-gpios,避免同线重复申请。
  • 新日志定位:告警主要落在 1c00000.pcie (pcie0),非 1c08000.pcie (pcie1)

9) 旧内核基线确认(4.19)

  • 在旧内核 4.19.125-perf-v2 下,0001 域可完整枚举:

    • 0001:01:00.0/0001:02:00.0/0001:02:06.0/0001:02:0e.0 均为 ASMedia 1b21:2806
    • 0001:04:00.0、0001:05:00.0 为 RTL8111/8168 (10ec:8168)
  • 旧日志明确出现 RC1: asm2806(1),说明厂商侧存在 RC1 私有桥控制语义。

10) 主线运行时补偿脚本尝试(失败)

  • 在主线 6.19.0-dirty 运行 pcie1-kick.sh

    • gpioset gpiochip0 141=... 返回 Invalid argument
    • /sys/bus/platform/drivers/qcom-pcie/{bind,unbind} 不存在或不可用,无法运行时重绑 RC1
  • 结论:该平台当前不支持“运行时重绑+踢脚”路径,需回到“重启生效”的 DTS 单变量法。

11) 纯 DTS 单变量回归(最新)

  • 已将 gpio141 改回 gpio-hog output-high,并移除 pcie1 对 pcie1_lan1_wake_default 的引用。
  • 该版本仅改 gpio141 启动策略,便于与上一版直接对比。

12) 纯 DTS 单变量(本轮)

  • gpio141 从 gpio-hog 改为 pcie1 设备级 pinctrl(pcie1_lan1_en_default),配置为 output-high。
  • &pcie1 的 pinctrl-0 调整为 <&pcie1_default_state>, <&pcie1_lan1_en_default>。
  • 保持 wake-gpios=84、reset-gpios=82、qcom.c 不变,确保本轮仅变更 141 的所有权与生效时机。
  • 结果:等待该版本实机回传确认。

当前观测结论(最新)

  • 主线 6.19 lspci 稳定仅有 0001:00:00.0(root port),ASM2806 / RTL8168 不可见
  • vendor 4.19 完整枚举:ASM2806 × 4 + RTL8168 × 2,eth1 UP
  • 链路速度不是根因:Gen1 x1 下 config TLP 依然可以工作,vendor 4.19 跑 Gen3 x2 但根因另在
  • 已确认根因:主线 iommu-map 触发 BDF_TO_SID_BYPASS 被清除 → SMMU fault → 0xFFFF(见下方根因分析节)
  • 当前最新镜像(step 22)已将 iommu-map 删除,待实机验证

13) built-in helper driver + gpiod 主动驱动 gpio141(本轮)

背景:步骤 12 中 rescan 已成功执行(dmesg 显示 "PCI 1:00 rescan complete"),但 ASM2806 config space 返回全 0xFF,说明 gpio141 未被物理驱动。Qualcomm TLMM 的 pinctrl output-high 仅设置 mux,开机后不保证实际电平被 GPIO 控制器持续驱动。

变更内容(单变量):

  1. DTS — eb5-pcie1-helper 节点加入 lan-en-gpios = <&tlmm 141 GPIO_ACTIVE_HIGH>
  2. DTS — 从 &pcie1 pinctrl-0 移除 pcie1_lan1_en_default;从 &tlmm 删除该 pinctrl block(gpio141 所有权唯一归 helper driver)
  3. 驱动 — pcie-qcom-eb5-helper.c:struct 加 struct gpio_desc *lan_en_gpio
  4. 驱动 — probe 时 devm_gpiod_get_optional(dev, "lan-en", GPIOD_OUT_HIGH) 请求并立即驱动 gpio141 为高
  5. 驱动 — worker 在 pci_find_bus() 前再次 gpiod_set_value_cansleep(h->lan_en_gpio, 1) + msleep(50) 确保 ASM2806 就绪

预期验证口径(刷机后):

dmesg | grep -E 'eb5-pcie1|lan-en|gpio141|rescan|0001:'
lspci -nn | grep '^0001:'
ip -br a

若仍无响应,追加:

lspci -vv -s 0001:00:00.0 | grep -E 'LnkSta|Speed|Width'

14) genpd provider 方案 — 让 pcie1 依赖 helper(本轮)

方案来源:建议写个像 power-domain 的驱动。

原理:将 helper 改为 generic_pm_domain (genpd) provider。&pcie1 的 power-domains 列表加入 <&eb5_pcie1_helper>,内核 PM domain 框架会在 qcom-pcie probe 之前自动调用 helper 的 power_on 回调,保证 gpio141 在 RC1 第一次枚举时已经 asserted。彻底不需要 rescan、delayed_work 和时序猜测。

变更内容(单变量):

  1. 驱动重构 — 删除所有 pci.h / workqueue.h / delayed_work / pci_scan_child_bus 代码

    • 新结构体:struct eb5_lan_pd { struct generic_pm_domain genpd; struct gpio_desc *lan_en_gpio; }
    • probe:devm_gpiod_get(GPIOD_OUT_HIGH) + msleep(50) + pm_genpd_init(is_off=false) + of_genpd_add_provider_simple()
    • remove:of_genpd_del_provider() + pm_genpd_remove()
    • power_on 回调:gpiod_set_value_cansleep(1) + msleep(50)(供 suspend/resume 使用)
    • GENPD_FLAG_ALWAYS_ON:运行时不切断电源
  2. DTS — helper 节点加 #power-domain-cells = <0>; 并加 label eb5_pcie1_helper:
  3. DTS — &pcie1 覆盖 power-domains = <&gcc PCIE_1_GDSC>, <&eb5_pcie1_helper>; 以及 power-domain-names = "gdsc", "lan-en";

时序保证

  • fw_devlink 使 pcie1 不会在 helper 注册 genpd provider 之前 probe
  • genpd 框架在 pcie1 上电时调用 power_on,gpio141 在 qcom_pcie_probe→link_up→enumeration 全链路之前已高

预期验证口径(刷机后):

dmesg | grep -E 'eb5-pcie1|lan-en|pcie1.*power|0001:'
lspci -nn | grep '^0001:'
ip -br a

15) 时钟 provider 排序方案(本轮)

根因分析(步骤 14 失败原因)

  • 步骤 14 添加 power-domains = <&gcc PCIE_1_GDSC>, <&eb5_pcie1_helper> 后,fw_devlink 和 genpd 两套机制各自为 pcie1↔helper 创建了一条 device_link。双 device_link 冲突导致 pci_register_host_bridge → request_resource 返回 -16 (EBUSY),pcie1 probe 失败。

当前方案(朋友思路的正确实现):

  • 不触碰 power-domains,改用 clock provider 做排序锚点:

    1. helper 注册一个 zero-rate dummy 时钟(#clock-cells = <0>)
    2. qrb5165-eb5.dts 在 pcie1 的 clocks/clock-names 末尾加 <&eb5_pcie1_helper> "lan-en"
    3. fw_devlink 识别 clocks phandle,自动保证 pcie1 probe 在 helper probe 完成之后
    4. helper probe 时驱动 gpio141 high + msleep(50),随后 of_clk_add_hw_provider 完成
    5. qcom-pcie 只 devm_clk_bulk_get 它知道的 9 个名字,"lan-en" 被忽略,无副作用

时序

  • T=1.2s: helper probe → gpio141 HIGH + clock provider 注册 → fw_devlink 放行 pcie1
  • T=6.4s: pcie1 probe → qcom_pcie_host_init → PERST# de-assert(gpio141 已 high ≥ 5s)

变更内容

  1. 驱动重构 — 移除所有 genpd/pm_domain 代码;改用 clk_init_data + devm_clk_hw_register + devm_of_clk_add_hw_provider
  2. DTS — helper 节点 #power-domain-cells → #clock-cells = <0>
  3. DTS — pcie1 恢复原始 power-domains(不再覆盖),添加完整 clocks/clock-names 列表并追加 <&eb5_pcie1_helper> "lan-en"

预期 dmesg 口径

dmesg | grep -E 'eb5-pcie1|lan-en|1c08000.pcie|0001:'
lspci -nn | grep '^0001:'
ip -br a

应看到:

  1. eb5-pcie1-helper: lan-en (gpio141) asserted high @ ~1.2s
  2. eb5-pcie1-helper: clock provider registered... @ ~1.2s
  3. qcom-pcie 1c08000.pcie: PCI host bridge to bus 0001:00 @ ~6.x s(probe 不再 EBUSY)
  4. 0001:01:00.0 ... 1b21:2806(ASM2806)+ 0001:04:00.0 10ec:8168(RTL8168)

21) 修正为 Gen3 x2,重建镜像

变更(单变量):

  • qrb5165-eb5.dts 的 &pcie1:max-link-speed = <1> → <3>,num-lanes = <1> → <2>
  • 其余不变(clock provider、gpio141、SBR 等保留)

实测结果(已刷机)

  • LnkSta 仍为 0x1011(Gen1 x1),即使写了 max-link-speed=3 也无法训练到 Gen3
  • 手动写 LnkCtl2 TLS=Gen3(setpci ... CAP_EXP+0x30.w=0x0003)再触发 retrain → 仍 Gen1 x1
  • PHY 寄存器层面 sm8250_qmp_gen3x2_pciephy_cfg 在主线确实存在(lanes=2,tbls_rc Gen3x2 表完整),但实际无法跑到 Gen3
  • ASM2806 config space 仍返回 0xffffffff
  • 结论:Gen1 x1 ≠ 根因。速度与 config TLP 可见性正交,Gen1 链路也应能读 config space

关键反转:步骤 21 之前的假设("Gen1 导致 ASM2806 不可见")已被证伪。继续在旧内核 4.19 上用 pci-msm debugfs 深挖寄存器差异。

22) 修复:/delete-property/ iommu-map(当前待验证)

变更(单变量,仅改 DTS):

  • qrb5165-eb5.dts 的 &pcie1 中,将多条 iommu-map 条目替换为 /delete-property/ iommu-map;
  • 这使 qcom_pcie_config_sid_1_9_0() 在 size=0 时提前返回,不清除 BDF_TO_SID_BYPASS bit
  • 其余不变(max-link-speed=3, num-lanes=2, clock provider, gpio141 均保留)

镜像哈希:53d1472f5be3b1d28131ff79752dce537176661193bf33100089ea4e1bcd6066

验证口径(刷机后):

dmesg | grep -E '0001:|1c08000|eb5-pcie1'
lspci -nn | grep '^0001:'   # 期望:1b21:2806 + 10ec:8168
ip -br a                    # 期望:eth0/eth1 UP
# 若 ASM2806 出现但 eth DOWN,追查:
dmesg | grep -E 'smmu.*fault|iommu.*error|r8169|rtl8168'

预期原理:bypass=1 → SMMU 透传所有 PCIe CplD → ASM2806 config read 正常返回 0x1b212806


旧内核(4.19)pci-msm debugfs 深度探查

22-vendor) 确认旧内核链路和设备状态

板上执行(vendor 4.19):

ip -br a
lo               UNKNOWN  127.0.0.1/8 ::1/128
bond0            DOWN
dummy0           UNKNOWN  fe80::d3f:ae5d:b7ab:4c76/64
eth1             UP       192.168.1.185/24 ...   ← NIC working ✅
tailscale0       UNKNOWN  100.64.0.70/32 ...
wlan0            UP       192.168.6.1/24 ...

lspci -nn | grep '^0001:'
0001:00:00.0 PCI bridge [0604]: Qualcomm Device [17cb:010b]
0001:01:00.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:02:00.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:02:06.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:02:0e.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:04:00.0 Ethernet controller [0200]: Realtek ... RTL8111/8168 [10ec:8168] (rev 15) ✅
0001:05:00.0 Ethernet controller [0200]: Realtek ... RTL8111/8168 [10ec:8168] (rev 15) ✅

23-vendor) pci-msm debugfs case 9-32 探查结果

执行(vendor 4.19,rc_sel=2):

echo 2 > /sys/kernel/debug/pci-msm/rc_sel
for i in 9 10 11 12 13 14 15 16 20 21 22 32; do
    echo "=== case $i ==="; echo $i > case; dmesg | tail -5; sleep 0.3
done

结果摘要

case | 含义 | 输出关键内容
9 | disable L1ss | PCIe: RC1: disable L1ss
10 | enable L1ss | PCIe: RC1: enable L1ss
11 | enumerate RC1 | PCIe: RC1 is already enumerated(不重枚举)
12 | read PARF register | base: parf: 0xffffff80089d8000;wr_offset: 0x2c0;value: 0x0
13 | write PARF register | parf+0x2c0 = 0x0(vendor 的 wr_offset=0x2c0 对应 PARF 内一个不同寄存器,非主线 PARF_BDF_TO_SID_CFG=0x2c00)
14 | write PARF register | 同 case 13(延续 wr_offset/mask/value 设置)
15 | map LBAR 4K DDR | 映射 DDR + LBAR=0x40008000 到内核虚地址 0xffffff8010e4d000
16 | unmap LBAR | 释放 case 15 的映射
20 | Read DDR values | DDR is not mapped(需先做 case 15)
21 | Read LBAR values | LBAR address is not mapped(需先做 case 15)
22 | Write 0x1 to DDR | DDR address is not mapped
32 | set target speed Gen3 | PCIe: RC1: set target speed to Gen 3

重点

  • case=12 读 parf+0x2c0 值为 0x0 → vendor 4.19 该偏移为零,不对应主线的 PARF_BDF_TO_SID_CFG(主线定义为 0x2c00,差了一个零)
  • case=15 LBAR=0x40008000(vendor driver 的 debug ioremap 窗口,非 mainline ATU 0x40001000)
  • gpio debugfs 无输出:cat /sys/kernel/debug/gpio | grep -E ' 82 | 141 ' → 空,说明 vendor 4.19 gpio 82/141 未以 debugfs 方式暴露(可能走的是 msm_pcie 私有接口)

24-vendor) pci-msm case=4 shadow dump 解析

执行(之前已获取):

echo 2 > /sys/kernel/debug/pci-msm/rc_sel
echo 4 > /sys/kernel/debug/pci-msm/case

输出(RTL8168 at 05:00.0 的 shadow registers):

shadow_dw[4]:cfg  0x10: 0x40202001   # BAR0 low (64-bit prefetchable, addr=0x40202000)
shadow_dw[6]:cfg  0x18: 0x40404004   # Type-1 bridge bus numbers
shadow_dw[8]:cfg  0x20: 0x40400004   # Memory Base/Limit
shadow_dw[20]:cfg 0x50: 0x817005     # PCIe capability
shadow_dw[30]:cfg 0x78: 0x10501f     # Link cap: Gen3 x2
shadow_dw[32]:cfg 0x80: 0x10110002   # LnkSta2/LnkCtl2
shadow_dw[44]:cfg 0xb0: 0x30011      # L1 sub-state cap

→ vendor RTL8168 BAR 被正确分配在 0x40202000,内存窗口 0x40400000 区域正常。


根因分析(最终结论)

代码路径溯源

  1. 主线 ops_1_9_0(pcie-qcom.c)对应 qcom,pcie-sm8250,其 .config_sid = qcom_pcie_config_sid_1_9_0
  2. 该函数逻辑:

    of_get_property(dev->of_node, "iommu-map", &size);
    if (!size)
        return 0;           // ← 有 iommu-map 才往下走
    val = readl(pcie->parf + PARF_BDF_TO_SID_CFG);  // PARF_BDF_TO_SID_CFG = 0x2c00
    val &= ~BDF_TO_SID_BYPASS;   // ← 清除 bypass bit(BIT(0))
    writel(val, pcie->parf + PARF_BDF_TO_SID_CFG);
    // 然后填充 BDF-to-SID 哈希表...
  3. eb5.dts 中 &pcie1 有多条 iommu-map(BDF 0x0000→0x0600),导致 config_sid 被调用
  4. bypass bit(BIT(0))被清除 → SMMU 对所有下游设备的 CplD 执行 BDF→SID 查表
  5. PCI 枚举阶段,iommu_attach_device() 尚未被调用(它发生在 device_add() 之后,即第一次 config read 成功之后)
  6. SMMU 查 SID 0x1c81(ASM2806,BDF=0x0100)→ 无 context bank → fault → Cpl 被丢弃 → CPU 读到 0xffffffff

vendor 4.19 为何正常

  • out-of-tree msm_pcie 驱动从不调用任何等价 config_sid 逻辑
  • BDF_TO_SID_BYPASS(硬件上电默认值 = 1,即 bypass 开启)始终保持
  • 所有 PCIe completions 绕过 SMMU SID 查表,直达 CPU → ASM2806 config read 正常返回

对比表

项目 | vendor 4.19 (msm_pcie) | 主线 6.19 (pcie-qcom.c)
驱动 | out-of-tree,特有 RC 初始化 | 主线,通用 DWC ops_1_9_0
config_sid 调用 | 从不调用 | iommu-map 非空时调用
BDF_TO_SID_BYPASS | = 1(硬件默认,从未清除) | = 0(被主动清除)
ASM2806 CplD 路径 | 绕过 SMMU → CPU ✅ | SMMU fault → 丢包 ❌
LnkSta | Gen3 x2 | Gen1 x1(PHY 实际不支持 Gen3)