MENU

EB5 PCIe/有线网迁移测试记录(主线 6.19)

March 9, 2026 • Read: 265 • Linux,Qualcomm,Hardware

环境

  • 目标板:QRB5165 EB5
  • 基线:qrb5165-eb5.dts 继承 qrb5165-rb5.dts
  • 目标:恢复 pcie1 下有线网卡 endpoint 枚举

主线 dts

// SPDX-License-Identifier: BSD-3-Clause
/*
 * Qualcomm Robotics EB5 platform based on QRB5165/RB5
 */

/dts-v1/;

#include "qrb5165-rb5.dts"

/*
 * Board identity for the EB5; everything not overridden in this file comes
 * from the included qrb5165-rb5.dts.
 */
/ {
    model = "Qualcomm Technologies, Inc. qrb5165 IOT EB5";
    /* Most specific strings first, generic SoC string ("qcom,sm8250") last. */
    compatible = "qcom,kona-iot", "qcom,kona", "qcom,iot",
             "qcom,eb5", "qcom,qrb5165-rb5", "qcom,sm8250";

    aliases {
        ufshc1 = &ufs_mem_hc;
        /*
         * NOTE(review): presumably these keep the PCI domain numbers
         * stable; the helper driver looks up pcie1 as domain 1
         * (EB5_PCIE1_DOMAIN). Confirm that mainline honours these aliases.
         */
        pci-domain0 = &pcie0;
        pci-domain1 = &pcie1;
        pci-domain2 = &pcie2;
    };
};

/*
 * pcie1 (RC1, 1c08000.pcie): the controller behind which the ASM2806
 * cascade and the RTL8168 Ethernet endpoints are expected to enumerate.
 */
&pcie1 {
    status = "okay";
    /* Request Gen3 x2; the test log reports the link still trains Gen1 x1. */
    max-link-speed = <3>;
    num-lanes = <2>;
    wake-gpios = <&tlmm 84 GPIO_ACTIVE_HIGH>;
    vdda-supply = <&vreg_l9a_1p2>;
    vddpe-3v3-supply = <&vreg_l11c_3p3>;
    /*
     * Remove iommu-map so qcom_pcie_config_sid_1_9_0() returns early
     * and never clears BDF_TO_SID_BYPASS.  With bypass ON (hardware
     * default), inbound PCIe completions skip SMMU BDF-to-SID lookup.
     *
     * Background: during PCI bus walk the SMMU has no context banks for
     * downstream-device SIDs yet (iommu_attach_device() is called only
     * after device_add(), i.e. AFTER the first config read).  The vendor
     * msm_pcie driver never touches BDF_TO_SID_CFG, so bypass stays ON
     * there and ASM2806 completions flow through unblocked.
     *
     * NOTE(review): step 25 of the test log restores iommu-map again;
     * confirm which variant the current image is built from.
     */
    /delete-property/ iommu-map;
    /*
     * Append the helper as an extra clock entry so fw_devlink ensures
     * the helper probes (and gpio141 is driven) before qcom-pcie probes.
     * qcom-pcie only requests clocks by the names it knows; "lan-en" is
     * silently ignored by the driver but honored by fw_devlink.
     */
    clocks = <&gcc GCC_PCIE_1_PIPE_CLK>,
         <&gcc GCC_PCIE_1_AUX_CLK>,
         <&gcc GCC_PCIE_1_CFG_AHB_CLK>,
         <&gcc GCC_PCIE_1_MSTR_AXI_CLK>,
         <&gcc GCC_PCIE_1_SLV_AXI_CLK>,
         <&gcc GCC_PCIE_1_SLV_Q2A_AXI_CLK>,
         <&gcc GCC_PCIE_WIGIG_CLKREF_EN>,
         <&gcc GCC_AGGRE_NOC_PCIE_TBU_CLK>,
         <&gcc GCC_DDRSS_PCIE_SF_TBU_CLK>,
         <&eb5_pcie1_helper>;
    /* One name per clocks entry above, in the same order. */
    clock-names = "pipe", "aux", "cfg", "bus_master", "bus_slave",
              "slave_q2a", "ref", "tbu", "ddrss_sf_tbu", "lan-en";
    /* Only the default pcie1 pin state; gpio141 is owned by the helper node. */
    pinctrl-0 = <&pcie1_default_state>;

    /*
     * Root-port child node, intentionally empty here.
     * NOTE(review): presumably reset-gpios is inherited from the base
     * dts (the test log mentions keeping pcie@0/reset-gpios) - confirm.
     */
    pcie@0 {
    };
};

/*
 * Lightweight anchor node for the eb5-pcie1-helper platform driver
 * (pcie-qcom-eb5-helper.c, registered with builtin_platform_driver()).
 * The driver waits for RC1 link stabilisation and then rescans pcie1 (domain 1)
 * so the ASM2806 cascade + RTL8168 endpoints can be discovered without any
 * modifications to qcom.c.
 */
/ {
    eb5_pcie1_helper: eb5-pcie1-helper {
        /* Must match eb5_pcie_helper_of_match[] in the driver. */
        compatible = "qcom,eb5-pcie1-helper";
        /* Requested by the driver as "lan-en" and driven high in probe(). */
        lan-en-gpios = <&tlmm 141 GPIO_ACTIVE_HIGH>;
        /* Dummy clock provider; &pcie1 lists it in clocks for probe ordering. */
        #clock-cells = <0>;
        status = "okay";
    };
};

/*
 * pcie2 and its PHY are switched off; the test log (item 7) did this to rule
 * out a pcie2 conflict while debugging pcie1.
 */
&pcie2 {
    status = "disabled";
};

&pcie2_phy {
    status = "disabled";
};

/* No TLMM overrides in this snapshot; the node is kept as an empty placeholder. */
&tlmm {
};

EB5 Helper Module

// SPDX-License-Identifier: GPL-2.0
/*
 * pcie-qcom-eb5-helper.c - PCIe1 ASM2806 bridge bring-up helper for QRB5165 EB5
 *
 * The Qualcomm vendor 4.19 kernel has a private "use-pcie-bridge-asm2806" DT
 * property that drives gpio141 (ASM2806 bridge-enable) before the RC1 PCIe
 * controller enumerates the bus.  Mainline qcom-pcie has no such logic.
 *
 * This driver:
 *   1. Acquires gpio141 and drives it HIGH on probe (before qcom-pcie touches
 *      PERST#).
 *   2. Registers itself as a dummy fixed-rate clock provider (#clock-cells = <0>),
 *      which qrb5165-eb5.dts adds to pcie1's clock list.  fw_devlink sees this
 *      phandle and guarantees that pcie1 will NOT be probed until this driver's
 *      probe() returns successfully.
 *   3. Schedules a deferred rescan after the ASM2806 cascade downstream links
 *      finish training.  The rescan explicitly programs bridge MEMORY_BASE/LIMIT
 *      registers to hardware — pci_assign_unassigned_bus_resources() only updates
 *      kernel data structures but does NOT write the bridge window registers,
 *      which would leave the RTL8168 endpoints inaccessible.
 *
 * Result: gpio141 is driven high BEFORE qcom_pcie_host_init() de-asserts PERST#,
 * so the ASM2806 bridge is powered and ready for config-space enumeration.
 *
 * Nothing in qcom.c / pcie-qcom.c is modified.
 */

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/delay.h>
#include <linux/gpio/consumer.h>
#include <linux/clk-provider.h>
#include <linux/clk.h>
#include <linux/pci.h>
#include <linux/workqueue.h>
#include <linux/io.h>

/* PCI domain 1, root bus 0 — matches pcie1 (1c08000.pcie) */
#define EB5_PCIE1_DOMAIN    1
#define EB5_PCIE1_ROOT_BUS    0

/*
 * Poll interval while waiting for pcie1 to create its root bus.
 * pcie1 probe can take 10-15 s at boot; we retry up to 30 times (30 s).
 */
#define EB5_POLL_INTERVAL_MS    1000
#define EB5_POLL_MAX_RETRIES    30

#define DRV_NAME "qcom-eb5-pcie1-helper"

/* Per-device state of the helper, allocated with devm_kzalloc() in probe(). */
struct eb5_pcie_helper {
    /* "lan-en" line (gpio141), requested as output-high in probe() */
    struct gpio_desc    *lan_en_gpio;
    /* dummy clock registered so pcie1 can reference this node in clocks */
    struct clk_hw        clk_hw;
    /* deferred poll-and-rescan job, see eb5_pcie1_rescan_work() */
    struct delayed_work  rescan_work;
};

/* Empty ops table: the clock has no callbacks and exists only as a provider. */
static const struct clk_ops eb5_lan_clk_ops = { /* no-op clock, ordering only */ };

/*
 * program_bridge_windows - write bridge memory windows to hardware registers.
 *
 * pci_assign_unassigned_bus_resources() assigns memory windows in the kernel's
 * resource tree but does NOT write PCI_MEMORY_BASE / PCI_MEMORY_LIMIT to the
 * bridge's config space.  Without this step the bridge does not forward memory
 * transactions downstream, so endpoint drivers fail on their very first MMIO
 * access.  Recurse into child buses so all levels of ASM2806 are programmed.
 */
static void program_bridge_windows(struct pci_bus *bus)
{
    struct pci_dev *dev;

    list_for_each_entry(dev, &bus->devices, bus_list) {
        struct resource *res;
        u16 cmd;

        if (!dev->subordinate)
            continue;

        /* Write 32-bit non-prefetchable memory window */
        res = &dev->resource[PCI_BRIDGE_MEM_WINDOW];
        if (resource_size(res) > 0) {
            u16 mem_base  = (res->start >> 16) & 0xfff0;
            u16 mem_limit = (res->end   >> 16) & 0xfff0;

            pci_write_config_word(dev, PCI_MEMORY_BASE,  mem_base);
            pci_write_config_word(dev, PCI_MEMORY_LIMIT, mem_limit);
            dev_info(&dev->dev, "bridge mem window programmed: %pR\n", res);
        }

        /* Write I/O window if present */
        res = &dev->resource[PCI_BRIDGE_IO_WINDOW];
        if (resource_size(res) > 0) {
            pci_write_config_byte(dev, PCI_IO_BASE,
                          (res->start >> 8) & 0xf0);
            pci_write_config_byte(dev, PCI_IO_LIMIT,
                          (res->end   >> 8) & 0xf0);
        }

        /* Enable bus-mastering and memory-space decoding on the bridge */
        pci_read_config_word(dev, PCI_COMMAND, &cmd);
        cmd |= PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER;
        pci_write_config_word(dev, PCI_COMMAND, cmd);

        program_bridge_windows(dev->subordinate);
    }
}

static void eb5_pcie1_rescan_work(struct work_struct *work)
{
    struct eb5_pcie_helper *h =
        container_of(work, struct eb5_pcie_helper, rescan_work.work);
    struct pci_bus *root_bus;
    int retries = 0;

    /* Re-assert gpio141 in case it was reset during suspend/resume */
    gpiod_set_value_cansleep(h->lan_en_gpio, 1);

    /*
     * pcie1 probe (and bus creation) can take 10-15 s at boot — much
     * longer than the helper's own probe.  Poll until the root bus
     * appears, then give the ASM2806 downstream links an extra second
     * to finish training before we scan.
     */
    while (retries < EB5_POLL_MAX_RETRIES) {
        root_bus = pci_find_bus(EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS);
        if (root_bus)
            break;
        pr_debug(DRV_NAME ": waiting for pcie1 bus (attempt %d/%d)\n",
             retries + 1, EB5_POLL_MAX_RETRIES);
        msleep(EB5_POLL_INTERVAL_MS);
        retries++;
    }

    if (!root_bus) {
        pr_err(DRV_NAME ": domain %u bus %02x not found after %d s, giving up\n",
               EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS, EB5_POLL_MAX_RETRIES);
        return;
    }

    /* Extra settling time for ASM2806 downstream link training */
    msleep(1000);

    pr_info(DRV_NAME ": rescanning pcie1 (domain %u bus %02x) after %d poll(s)\n",
        EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS, retries);

    /*
     * Issue a Secondary Bus Reset (SBR) via the root port's Bridge Control
     * register.  This pulses the downstream PERST# from the RC side without
     * needing direct GPIO access.  The ASM2806 may have been unresponsive
     * during the initial enumeration because its internal init was not yet
     * complete when qcom-pcie first de-asserted PERST# at probe time.
     * Driving SBR here — after gpio141 has been high for several seconds —
     * gives the ASM2806 a clean reset cycle with power already stable.
     */
    {
        struct pci_dev *rp = pci_get_domain_bus_and_slot(
                    EB5_PCIE1_DOMAIN, 0, PCI_DEVFN(0, 0));
        if (rp) {
            u16 bctl;

            pci_read_config_word(rp, PCI_BRIDGE_CONTROL, &bctl);
            /* Assert Secondary Bus Reset */
            pci_write_config_word(rp, PCI_BRIDGE_CONTROL,
                          bctl | PCI_BRIDGE_CTL_BUS_RESET);
            msleep(100); /* hold reset ≥ 100 ms (PCIe r3.0 §6.6.1) */
            /* De-assert Secondary Bus Reset */
            pci_write_config_word(rp, PCI_BRIDGE_CONTROL, bctl);
            msleep(500); /* wait for ASM2806 to finish link re-training */
            dev_info(&rp->dev,
                 DRV_NAME ": SBR pulse done, waiting for ASM2806\n");
            pci_dev_put(rp);
        } else {
            pr_warn(DRV_NAME ": root port not found, skipping SBR\n");
        }
    }

    pci_lock_rescan_remove();

    /* Step 1: discover new devices (ASM2806 cascade + RTL8168) */
    pci_scan_child_bus(root_bus);

    /* Step 2: assign BARs and bridge windows in kernel resource structs */
    pci_assign_unassigned_bus_resources(root_bus);

    /*
     * Step 3: write bridge MEMORY_BASE/LIMIT to hardware config space.
     * This is the step that pci_rescan_bus() / pci_assign_…() omit,
     * and without it the RTL8168 endpoints are unreachable via MMIO.
     */
    program_bridge_windows(root_bus);

    /* Step 4: add devices to driver model — triggers driver probes */
    pci_bus_add_devices(root_bus);

    pci_unlock_rescan_remove();

    pr_info(DRV_NAME ": rescan complete, bridges programmed\n");
}

/* Forward declaration for sysfs attribute */
static DEVICE_ATTR_WO(reset_asm2806);

/*
 * eb5_pcie_helper_probe - bring up the lan-en line and register the dummy clock.
 * @pdev: platform device created for the "qcom,eb5-pcie1-helper" node
 *
 * Order of operations:
 *   1. request the "lan-en" GPIO as output-high and wait 50 ms;
 *   2. register a callback-less clk_hw and expose it as an OF clock provider,
 *      so that pcie1 (which lists this node in its clocks) can be ordered
 *      after this probe by fw_devlink;
 *   3. queue the rescan work, which itself polls for the pcie1 root bus;
 *   4. create the reset_asm2806 sysfs file (failure is only warned about).
 *
 * Return: 0 on success, -ENOMEM or the error from the GPIO/clock calls.
 */
static int eb5_pcie_helper_probe(struct platform_device *pdev)
{
    struct device *dev = &pdev->dev;
    struct eb5_pcie_helper *h;
    struct clk_init_data init = {};
    int ret;

    h = devm_kzalloc(dev, sizeof(*h), GFP_KERNEL);
    if (!h)
        return -ENOMEM;

    /*
     * Acquire gpio141 (ASM2806 bridge-enable) and drive it HIGH.
     * This must happen before qcom-pcie de-asserts PERST# on RC1.
     * The clock provider registration below ensures that qcom-pcie
     * does not even begin probing until after this point.
     */
    h->lan_en_gpio = devm_gpiod_get(dev, "lan-en", GPIOD_OUT_HIGH);
    if (IS_ERR(h->lan_en_gpio))
        return dev_err_probe(dev, PTR_ERR(h->lan_en_gpio),
                 "failed to get lan-en gpio\n");

    msleep(50); /* allow ASM2806 to become ready */
    dev_info(dev, "lan-en (gpio141) asserted high\n");

    /*
     * Register a clock without any ops so that fw_devlink can enforce
     * the probe ordering: pcie1 (which lists us in its 'clocks') will
     * not probe before the provider is registered.  The init data only
     * needs to live until devm_clk_hw_register() returns.
     */
    init.name = DRV_NAME;
    init.ops  = &eb5_lan_clk_ops;
    h->clk_hw.init = &init;

    ret = devm_clk_hw_register(dev, &h->clk_hw);
    if (ret)
        return dev_err_probe(dev, ret, "clk_hw_register failed\n");

    ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, &h->clk_hw);
    if (ret)
        return dev_err_probe(dev, ret,
                 "of_clk_add_hw_provider failed\n");

    /* Start polling immediately; the work itself waits for the bus */
    INIT_DELAYED_WORK(&h->rescan_work, eb5_pcie1_rescan_work);
    schedule_delayed_work(&h->rescan_work, 0);

    /* drvdata must be set before the sysfs file exists: the store reads it */
    platform_set_drvdata(pdev, h);
    /* device_create_file() takes the device_attribute, not its .attr member */
    ret = device_create_file(dev, &dev_attr_reset_asm2806);
    if (ret)
        dev_warn(dev, "failed to create reset_asm2806 sysfs: %d\n", ret);

    dev_info(dev, "clock provider registered, waiting for pcie1 bus\n");
    return 0;
}

static void eb5_pcie_helper_remove(struct platform_device *pdev)
{
    struct eb5_pcie_helper *h = platform_get_drvdata(pdev);

    cancel_delayed_work_sync(&h->rescan_work);
    device_remove_file(&pwd->dev, &dev_attr_reset_asm2806.attr);
}

/* Device-tree match table; the string must equal the helper node's compatible. */
static const struct of_device_id eb5_pcie_helper_of_match[] = {
    { .compatible = "qcom,eb5-pcie1-helper" },
    {}  /* sentinel */
};
MODULE_DEVICE_TABLE(of, eb5_pcie_helper_of_match);

/* Platform driver glue binding probe()/remove() to the match table above. */
static struct platform_driver eb5_pcie_helper_driver = {
    .probe          = eb5_pcie_helper_probe,
    .remove         = eb5_pcie_helper_remove,
    .driver = {
        .name           = DRV_NAME,
        .of_match_table = eb5_pcie_helper_of_match,
    },
};
/* Built-in only registration: there is no module exit path. */
builtin_platform_driver(eb5_pcie_helper_driver);

/*
 * PARF_BDF_TO_SID_CFG of pcie1: PARF offset 0x2c00 inside the 1c08000.pcie
 * register block, i.e. the physical address the bring-up notes read back
 * with devmem.  Bit 0 is BDF_TO_SID_BYPASS.
 */
#define EB5_PCIE1_PARF_BDF_TO_SID_CFG   0x01c0ac00
#define EB5_BDF_TO_SID_BYPASS           BIT(0)

/*
 * reset_asm2806_store - sysfs trigger for a full ASM2806 reset sequence.
 * @dev:   helper platform device
 * @attr:  the reset_asm2806 attribute (unused)
 * @buf:   written data (content is ignored, any write triggers the reset)
 * @count: number of bytes written
 *
 * Sequence: power-cycle gpio141, set BDF_TO_SID_BYPASS, pulse Secondary Bus
 * Reset on the root port, then rescan the pcie1 root bus.
 *
 * The root bus is looked up first: without it there is nothing to reset or
 * rescan, and the PARF register block may not be powered because pcie1 has
 * not probed yet.
 *
 * Return: @count on success, -ENODEV when the pcie1 root bus does not exist.
 */
static ssize_t reset_asm2806_store(struct device *dev,
                    struct device_attribute *attr,
                    const char *buf, size_t count)
{
    struct eb5_pcie_helper *h = dev_get_drvdata(dev);
    void __iomem *parf;
    struct pci_bus *root_bus;
    struct pci_dev *rp;

    root_bus = pci_find_bus(EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS);
    if (!root_bus) {
        dev_err(dev, "pcie1 root bus not present, reset aborted\n");
        return -ENODEV;
    }

    dev_info(dev, "triggering full ASM2806 reset sequence...\n");

    /* 1. Power cycle gpio141 */
    gpiod_set_value_cansleep(h->lan_en_gpio, 0);
    msleep(200);
    gpiod_set_value_cansleep(h->lan_en_gpio, 1);
    msleep(500);

    /* 2. Set the bypass bit, preserving the other bits of the register */
    parf = ioremap(EB5_PCIE1_PARF_BDF_TO_SID_CFG, 4);
    if (parf) {
        u32 val = readl(parf);

        writel(val | EB5_BDF_TO_SID_BYPASS, parf);
        dev_info(dev, "BDF_TO_SID_BYPASS set\n");
        iounmap(parf);
    }

    /* 3. Trigger SBR on the root port */
    rp = pci_get_domain_bus_and_slot(EB5_PCIE1_DOMAIN, EB5_PCIE1_ROOT_BUS,
                     PCI_DEVFN(0, 0));
    if (rp) {
        u16 bctl;

        pci_read_config_word(rp, PCI_BRIDGE_CONTROL, &bctl);
        pci_write_config_word(rp, PCI_BRIDGE_CONTROL,
                      bctl | PCI_BRIDGE_CTL_BUS_RESET);
        msleep(100);
        pci_write_config_word(rp, PCI_BRIDGE_CONTROL, bctl);
        msleep(500);
        pci_dev_put(rp); /* drop the reference taken by the lookup */
        dev_info(dev, "SBR done\n");
    }

    /* 4. Rescan: discover, assign resources, then bind drivers */
    pci_lock_rescan_remove();
    pci_scan_child_bus(root_bus);
    pci_assign_unassigned_bus_resources(root_bus);
    pci_bus_add_devices(root_bus);
    pci_unlock_rescan_remove();

    dev_info(dev, "reset sequence complete\n");
    return count;
}

已执行测试(按大类)

1) DTS 基线与 include 链确认

  • 确认 qrb5165-eb5.dts -> qrb5165-rb5.dts -> sm8250.dtsi 链路正确。
  • 校对 pcie1 主线默认脚位语义:perst=82、wake=84。

2) pcie1 链路参数测试

  • &pcie1 设置:max-link-speed = <1>(降速到 Gen1)。
  • &pcie1 设置:num-lanes = <1>(单 lane 试验)。
  • 结果:0001 域仍仅有 root port,无桥后 endpoint。

3) IOMMU 映射扩展测试

  • iommu-map 从基础项扩展到多 BDF(覆盖 0x0..0x600 相关条目)。
  • 结果:未触发有线 endpoint 出现。

4) 供电影响排查

  • 增加/确认 vdda-supply、vddpe-3v3-supply。
  • 结果:无显著变化,pcie1 仍空下游。

5) WAKE/RESET 脚位组合测试

  • wake-gpios 做过候选切换(22/84),最终回归 84(与主线/旧树一致)。
  • pcie@0/reset-gpios 做过极性 A/B 试验(active low/high 组合对比)。
  • 结果:仍未枚举到有线 endpoint。

6) 141 板级控制线测试

  • 早期方案:gpio141 作为 gpio-hog output-high(常高驱动)。
  • 最新方案:移除 hog,改为 pcie1 附加 pinctrl(pcie1_lan1_wake_default,bias-pull-up 输入态),贴近旧树语义。
  • 结果:等待该版本实机回传确认。

7) pcie2 冲突隔离测试

  • &pcie2、&pcie2_phy 设为 disabled。
  • 结果:未改变 pcie1 无下游端点现象。

8) GPIO 共享告警处置

  • 观察到告警:gpio_shared_add_proxy_lookup / qcom_pcie_probe
  • 做过修复:删除 &pcie1 的 perst-gpios,保留 pcie@0/reset-gpios,避免同线重复申请。
  • 新日志定位:告警主要落在 1c00000.pcie (pcie0),非 1c08000.pcie (pcie1)

9) 旧内核基线确认(4.19)

  • 在旧内核 4.19.125-perf-v2 下,0001 域可完整枚举:

    • 0001:01:00.0/0001:02:00.0/0001:02:06.0/0001:02:0e.0 均为 ASMedia 1b21:2806
    • 0001:04:00.0、0001:05:00.0 为 RTL8111/8168 (10ec:8168)
  • 旧日志明确出现 RC1: asm2806(1),说明厂商侧存在 RC1 私有桥控制语义。

10) 主线运行时补偿脚本尝试(失败)

  • 在主线 6.19.0-dirty 运行 pcie1-kick.sh

    • gpioset gpiochip0 141=... 返回 Invalid argument
    • /sys/bus/platform/drivers/qcom-pcie/{bind,unbind} 不存在或不可用,无法运行时重绑 RC1
  • 结论:该平台当前不支持“运行时重绑+踢脚”路径,需回到“重启生效”的 DTS 单变量法。

11) 纯 DTS 单变量回归(最新)

  • 已将 gpio141 改回 gpio-hog output-high,并移除 pcie1 对 pcie1_lan1_wake_default 的引用。
  • 该版本仅改 gpio141 启动策略,便于与上一版直接对比。

12) 纯 DTS 单变量(本轮)

  • gpio141 从 gpio-hog 改为 pcie1 设备级 pinctrl(pcie1_lan1_en_default),配置为 output-high。
  • &pcie1 的 pinctrl-0 调整为 <&pcie1_default_state>, <&pcie1_lan1_en_default>。
  • 保持 wake-gpios=84、reset-gpios=82、qcom.c 不变,确保本轮仅变更 141 的所有权与生效时机。
  • 结果:等待该版本实机回传确认。

当前观测结论(最新)

  • 主线 6.19 lspci 稳定仅有 0001:00:00.0(root port),ASM2806 / RTL8168 不可见
  • vendor 4.19 完整枚举:ASM2806 × 4 + RTL8168 × 2,eth1 UP
  • 链路速度不是根因:Gen1 x1 下 config TLP 依然可以工作,vendor 4.19 跑 Gen3 x2 但根因另在
  • 已确认根因:主线 iommu-map 触发 BDF_TO_SID_BYPASS 被清除 → SMMU fault → 0xFFFF(见下方根因分析节)
  • 当前最新镜像(step 22)已将 iommu-map 删除,待实机验证

13) built-in helper driver + gpiod 主动驱动 gpio141(本轮)

背景:步骤 12 中 rescan 已成功执行(dmesg 显示 "PCI 1:00 rescan complete"),但 ASM2806 config space 返回全 0xFF,说明 gpio141 未被物理驱动。Qualcomm TLMM 的 pinctrl output-high 仅设置 mux,开机后不保证实际电平被 GPIO 控制器持续驱动。

变更内容(单变量):

  1. DTS — eb5-pcie1-helper 节点加入 lan-en-gpios = <&tlmm 141 GPIO_ACTIVE_HIGH>
  2. DTS — 从 &pcie1 pinctrl-0 移除 pcie1_lan1_en_default;从 &tlmm 删除该 pinctrl block(gpio141 所有权唯一归 helper driver)
  3. 驱动 — pcie-qcom-eb5-helper.c:struct 加 struct gpio_desc *lan_en_gpio
  4. 驱动 — probe 时 devm_gpiod_get_optional(dev, "lan-en", GPIOD_OUT_HIGH) 请求并立即驱动 gpio141 为高
  5. 驱动 — worker 在 pci_find_bus() 前再次 gpiod_set_value_cansleep(h->lan_en_gpio, 1) + msleep(50) 确保 ASM2806 就绪

预期验证口径(刷机后):

dmesg | grep -E 'eb5-pcie1|lan-en|gpio141|rescan|0001:'
lspci -nn | grep '^0001:'
ip -br a

若仍无响应,追加:

lspci -vv -s 0001:00:00.0 | grep -E 'LnkSta|Speed|Width'

14) genpd provider 方案 — 让 pcie1 依赖 helper(本轮)

方案来源:建议写个像 power-domain 的驱动。

原理:将 helper 改为 generic_pm_domain (genpd) provider。&pcie1 的 power-domains 列表加入 <&eb5_pcie1_helper>,内核 PM domain 框架会在 qcom-pcie probe 之前自动调用 helper 的 power_on 回调,保证 gpio141 在 RC1 第一次枚举时已经 asserted。彻底不需要 rescan、delayed_work 和时序猜测。

变更内容(单变量):

  1. 驱动重构 — 删除所有 pci.h / workqueue.h / delayed_work / pci_scan_child_bus 代码

    • 新结构体:struct eb5_lan_pd { struct generic_pm_domain genpd; struct gpio_desc *lan_en_gpio; }
    • probe:devm_gpiod_get(GPIOD_OUT_HIGH) + msleep(50) + pm_genpd_init(is_off=false) + of_genpd_add_provider_simple()
    • remove:of_genpd_del_provider() + pm_genpd_remove()
    • power_on 回调:gpiod_set_value_cansleep(1) + msleep(50)(供 suspend/resume 使用)
    • GENPD_FLAG_ALWAYS_ON:运行时不切断电源
  2. DTS — helper 节点加 #power-domain-cells = <0>; 并加 label eb5_pcie1_helper:
  3. DTS — &pcie1 覆盖 power-domains = <&gcc PCIE_1_GDSC>, <&eb5_pcie1_helper>; 以及 power-domain-names = "gdsc", "lan-en";

时序保证

  • fw_devlink 使 pcie1 不会在 helper 注册 genpd provider 之前 probe
  • genpd 框架在 pcie1 上电时调用 power_on,gpio141 在 qcom_pcie_probe→link_up→enumeration 全链路之前已高

预期验证口径(刷机后):

dmesg | grep -E 'eb5-pcie1|lan-en|pcie1.*power|0001:'
lspci -nn | grep '^0001:'
ip -br a

15) 时钟 provider 排序方案(本轮)

根因分析(步骤 14 失败原因)

  • 步骤 14 添加 power-domains = <&gcc PCIE_1_GDSC>, <&eb5_pcie1_helper> 后,fw_devlink 和 genpd 两套机制各自为 pcie1↔helper 创建了一条 device_link。双 device_link 冲突导致 pci_register_host_bridge → request_resource 返回 -16 (EBUSY),pcie1 probe 失败。

当前方案(朋友思路的正确实现):

  • 不触碰 power-domains,改用 clock provider 做排序锚点:

    1. helper 注册一个 zero-rate dummy 时钟(#clock-cells = <0>)
    2. qrb5165-eb5.dts 在 pcie1 的 clocks/clock-names 末尾加 <&eb5_pcie1_helper> "lan-en"
    3. fw_devlink 识别 clocks phandle,自动保证 pcie1 probe 在 helper probe 完成之后
    4. helper probe 时驱动 gpio141 high + msleep(50),随后 of_clk_add_hw_provider 完成
    5. qcom-pcie 只 devm_clk_bulk_get 它知道的 9 个名字,"lan-en" 被忽略,无副作用

时序

  • T=1.2s: helper probe → gpio141 HIGH + clock provider 注册 → fw_devlink 放行 pcie1
  • T=6.4s: pcie1 probe → qcom_pcie_host_init → PERST# de-assert(gpio141 已 high ≥ 5s)

变更内容

  1. 驱动重构 — 移除所有 genpd/pm_domain 代码;改用 clk_init_data + devm_clk_hw_register + devm_of_clk_add_hw_provider
  2. DTS — helper 节点 #power-domain-cells → #clock-cells = <0>
  3. DTS — pcie1 恢复原始 power-domains(不再覆盖),添加完整 clocks/clock-names 列表并追加 <&eb5_pcie1_helper> "lan-en"

预期 dmesg 口径

dmesg | grep -E 'eb5-pcie1|lan-en|1c08000.pcie|0001:'
lspci -nn | grep '^0001:'
ip -br a

应看到:

  1. eb5-pcie1-helper: lan-en (gpio141) asserted high @ ~1.2s
  2. eb5-pcie1-helper: clock provider registered... @ ~1.2s
  3. qcom-pcie 1c08000.pcie: PCI host bridge to bus 0001:00 @ ~6.x s(probe 不再 EBUSY)
  4. 0001:01:00.0 ... 1b21:2806(ASM2806)+ 0001:04:00.0 10ec:8168(RTL8168)

21) 修正为 Gen3 x2,重建镜像

变更(单变量):

  • qrb5165-eb5.dts 的 &pcie1:max-link-speed = <1> → <3>,num-lanes = <1> → <2>
  • 其余不变(clock provider、gpio141、SBR 等保留)

实测结果(已刷机)

  • LnkSta 仍为 0x1011(Gen1 x1),即使写了 max-link-speed=3 也无法训练到 Gen3
  • 手动写 LnkCtl2 TLS=Gen3(setpci ... CAP_EXP+0x30.w=0x0003)再触发 retrain → 仍 Gen1 x1
  • PHY 寄存器层面 sm8250_qmp_gen3x2_pciephy_cfg 在主线确实存在(lanes=2,tbls_rc Gen3x2 表完整),但实际无法跑到 Gen3
  • ASM2806 config space 仍返回 0xffffffff
  • 结论:Gen1 x1 ≠ 根因。速度与 config TLP 可见性正交,Gen1 链路也应能读 config space

关键反转:步骤 21 之前的假设("Gen1 导致 ASM2806 不可见")已被证伪。继续在旧内核 4.19 上用 pci-msm debugfs 深挖寄存器差异。

22) 修复:/delete-property/ iommu-map(当前待验证)

变更(单变量,仅改 DTS):

  • qrb5165-eb5.dts 的 &pcie1 中,将多条 iommu-map 条目替换为 /delete-property/ iommu-map;
  • 这使 qcom_pcie_config_sid_1_9_0() 因 size=0 提前返回,不清除 BDF_TO_SID_BYPASS bit
  • 其余不变(max-link-speed=3, num-lanes=2, clock provider, gpio141 均保留)

镜像哈希:53d1472f5be3b1d28131ff79752dce537176661193bf33100089ea4e1bcd6066

验证口径(刷机后):

dmesg | grep -E '0001:|1c08000|eb5-pcie1'
lspci -nn | grep '^0001:'   # 期望:1b21:2806 + 10ec:8168
ip -br a                    # 期望:eth0/eth1 UP
# 若 ASM2806 出现但 eth DOWN,追查:
dmesg | grep -E 'smmu.*fault|iommu.*error|r8169|rtl8168'

预期原理:bypass=1 → SMMU 透传所有 PCIe CplD → ASM2806 config read 正常返回 0x1b212806

22) 实测结果(步骤 22 镜像,主线 6.19)

板上 dmesg 关键输出

[    1.232651] qcom-eb5-pcie1-helper eb5-pcie1-helper: lan-en (gpio141) asserted high
[    1.232708] qcom-eb5-pcie1-helper eb5-pcie1-helper: clock provider registered, waiting for pcie1 bus
[   14.151749] qcom-eb5-pcie1-helper: rescanning pcie1 (domain 1 bus 00) after 8 poll(s)
[   14.159817] qcom-eb5-pcie1-helper: BDF_TO_SID_BYPASS re-asserted before SBR
[   14.791767] pcieport 0001:00:00.0: qcom-eb5-pcie1-helper: SBR pulse done, waiting for ASM2806
[   14.824456] qcom-eb5-pcie1-helper: rescan complete, bridges programmed
lspci -nn | grep '^0001:'
0001:00:00.0 PCI bridge [0604]: Qualcomm Device [17cb:010b]   ← 仅 root port,ASM2806 未出现 ❌

诊断数据

项目 | 结果
LnkSta | Speed 2.5GT/s (downgraded), Width x1 (downgraded) — 链路已训练 ✅
gpio141 (debugfs) | out high func0 2mA pull down — 逻辑高但驱动力弱、内部下拉有效 ⚠️
PARF_BDF_TO_SID_CFG (devmem) | 0x00000001 — bypass=1 已生效 ✅
/sys/bus/pci/devices/0001:00:00.0 | bus 01 无设备 ❌

根因分析(三处 bug)

Bug 1 — SBR 打断已训练链路:rescan_work 中 SBR 将下游链路重置,500 ms 等待不足以完成重新训练。证据:
"SBR pulse done" (T=14.791) → "rescan complete" (T=14.824),仅 33 ms
pci_scan_child_bus 在链路还未训练完成时扫描,所有 config 读返回 0xFFFF(超时),ASM2806 未出现。
SBR 结束后链路自行训练完毕(LnkSta 此时已显示 Gen1 x1),但扫描已经结束。

Bug 2 — BDF_TO_SID_BYPASS 在首次扫描时尚未置位:helper 的 probe() 仅注册时钟 provider 后让 pcie1 解锁,
但 bypass bit 只在 rescan_work(T≈14s)中才写入。pcie1 首次扫描(T≈12s)时 bypass=0,
SMMU 对所有下游 completion 执行 BDF→SID 查表,无 context bank → fault → 0xFFFF

Bug 3 — gpio141 驱动强度不足:移除 pinctrl 后,TLMM 硬件默认为 2mA + pull-down,
输出为高但驱动力弱,内部下拉使电气余量降低。


23) 修复:probe() 预置 bypass + 移除 SBR + 恢复 gpio141 pinctrl

变更内容(三处修复):

1. 驱动 pcie-qcom-eb5-helper.c

  • probe():在 of_clk_add_hw_provider() 之前(即在 pcie1 解锁之前)写入 BDF_TO_SID_BYPASS=1(read-modify-write),确保 pcie1 首次扫描时 bypass 已生效
  • rescan_work():删除 SBR 全段(链路已由 qcom-pcie 在 probe 时训练完毕,SBR 只会打断它);bypass 仍做 read-modify-write re-assert(防止 suspend/resume 清除)
  • reset_asm2806_store():bypass 写入改为 read-modify-write

2. DTS qrb5165-eb5.dts

  • 恢复 helper 节点的 pinctrl:drive-strength = <16>; bias-disable; output-high;(gpio141)
  • 修正 gpio141 电气配置,消除 2mA pull down 状态

预期启动时序(修复后)

~1.2s  eb5-pcie1-helper: lan-en (gpio141) asserted high
~1.2s  eb5-pcie1-helper: BDF_TO_SID_BYPASS asserted before pcie1 probe
~1.2s  eb5-pcie1-helper: clock provider registered, waiting for pcie1 bus
~6-12s qcom-pcie 1c08000.pcie: PCI host bridge to bus 0001:00
       0001:01:00.0 1b21:2806  ← 首次扫描即发现,无需 rescan
       0001:04:00.0 10ec:8168

验证口径(刷机后):

dmesg | grep -E 'eb5-pcie1|BDF_TO_SID|0001:'
lspci -nn | grep '^0001:'   # 期望:1b21:2806 + 10ec:8168
ip -br a                    # 期望:eth1/eth2 UP
cat /sys/kernel/debug/gpio | grep 141  # 期望:out high func0 16mA no pull

旧内核(4.19)pci-msm debugfs 深度探查

22-vendor) 确认旧内核链路和设备状态

板上执行(vendor 4.19):

ip -br a
lo               UNKNOWN  127.0.0.1/8 ::1/128
bond0            DOWN
dummy0           UNKNOWN  fe80::d3f:ae5d:b7ab:4c76/64
eth1             UP       192.168.1.185/24 ...   ← NIC working ✅
tailscale0       UNKNOWN  100.64.0.70/32 ...
wlan0            UP       192.168.6.1/24 ...

lspci -nn | grep '^0001:'
0001:00:00.0 PCI bridge [0604]: Qualcomm Device [17cb:010b]
0001:01:00.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:02:00.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:02:06.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:02:0e.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806] (rev 01) ✅
0001:04:00.0 Ethernet controller [0200]: Realtek ... RTL8111/8168 [10ec:8168] (rev 15) ✅
0001:05:00.0 Ethernet controller [0200]: Realtek ... RTL8111/8168 [10ec:8168] (rev 15) ✅

23-vendor) pci-msm debugfs case 9-32 探查结果

执行(vendor 4.19,rc_sel=2):

echo 2 > /sys/kernel/debug/pci-msm/rc_sel
for i in 9 10 11 12 13 14 15 16 20 21 22 32; do
    echo "=== case $i ==="; echo $i > case; dmesg | tail -5; sleep 0.3
done

结果摘要

case | 含义 | 输出关键内容
9 | disable L1ss | PCIe: RC1: disable L1ss
10 | enable L1ss | PCIe: RC1: enable L1ss
11 | enumerate RC1 | PCIe: RC1 is already enumerated(不重枚举)
12 | read PARF register | base: parf: 0xffffff80089d8000,wr_offset: 0x2c0,value: 0x0
13 | write PARF register | parf+0x2c0 = 0x0(vendor 的 wr_offset=0x2c0 对应 PARF 内一个不同寄存器,非主线 PARF_BDF_TO_SID_CFG=0x2c00)
14 | write PARF register | 同 case 13(延续 wr_offset/mask/value 设置)
15 | map LBAR 4K DDR | 映射 DDR + LBAR=0x40008000 到内核虚地址 0xffffff8010e4d000
16 | unmap LBAR | 释放 case 15 的映射
20 | Read DDR values | DDR is not mapped(需先做 case 15)
21 | Read LBAR values | LBAR address is not mapped(需先做 case 15)
22 | Write 0x1 to DDR | DDR address is not mapped
32 | set target speed Gen3 | PCIe: RC1: set target speed to Gen 3

重点

  • case=12 读 parf+0x2c0 值为 0x0 → vendor 4.19 该偏移为零,不对应主线的 PARF_BDF_TO_SID_CFG(主线定义为 0x2c00,差了一个零)
  • case=15 LBAR=0x40008000(vendor driver 的 debug ioremap 窗口,非 mainline ATU 0x40001000)
  • gpio debugfs 无输出:cat /sys/kernel/debug/gpio | grep -E ' 82 | 141 ' → 空,说明 vendor 4.19 gpio 82/141 未以 debugfs 方式暴露(可能走的是 msm_pcie 私有接口)

24-vendor) pci-msm case=4 shadow dump 解析

执行(之前已获取):

echo 2 > /sys/kernel/debug/pci-msm/rc_sel
echo 4 > /sys/kernel/debug/pci-msm/case

输出(RTL8168 at 05:00.0 的 shadow registers):

shadow_dw[4]:cfg  0x10: 0x40202001   # BAR0 low (64-bit prefetchable, addr=0x40202000)
shadow_dw[6]:cfg  0x18: 0x40404004   # Type-1 bridge bus numbers
shadow_dw[8]:cfg  0x20: 0x40400004   # Memory Base/Limit
shadow_dw[20]:cfg 0x50: 0x817005     # PCIe capability
shadow_dw[30]:cfg 0x78: 0x10501f     # Link cap: Gen3 x2
shadow_dw[32]:cfg 0x80: 0x10110002   # LnkSta2/LnkCtl2
shadow_dw[44]:cfg 0xb0: 0x30011      # L1 sub-state cap

→ vendor RTL8168 BAR 被正确分配在 0x40202000,内存窗口 0x40400000 区域正常。


根因分析(最终结论)

代码路径溯源

  1. 主线 ops_1_9_0(pcie-qcom.c)对应 qcom,pcie-sm8250,其 .config_sid = qcom_pcie_config_sid_1_9_0
  2. 该函数逻辑:

    of_get_property(dev->of_node, "iommu-map", &size);
    if (!size)
        return 0;           // ← 有 iommu-map 才往下走
    val = readl(pcie->parf + PARF_BDF_TO_SID_CFG);  // PARF_BDF_TO_SID_CFG = 0x2c00
    val &= ~BDF_TO_SID_BYPASS;   // ← 清除 bypass bit(BIT(0))
    writel(val, pcie->parf + PARF_BDF_TO_SID_CFG);
    // 然后填充 BDF-to-SID 哈希表...
  3. eb5.dts 中 &pcie1 有多条 iommu-map(BDF 0x0000→0x0600),导致 config_sid 被调用
  4. bypass bit(BIT(0))被清除 → SMMU 对所有下游设备的 CplD 执行 BDF→SID 查表
  5. PCI 枚举阶段,iommu_attach_device() 尚未被调用(它发生在 device_add() 之后,即第一次 config read 成功之后)
  6. SMMU 查 SID 0x1c81(ASM2806,BDF=0x0100)→ 无 context bank → fault → Cpl 被丢弃 → CPU 读到 0xffffffff

vendor 4.19 为何正常

  • out-of-tree msm_pcie 驱动从不调用任何等价 config_sid 逻辑
  • BDF_TO_SID_BYPASS(硬件上电默认值 = 1,即 bypass 开启)始终保持
  • 所有 PCIe completions 绕过 SMMU SID 查表,直达 CPU → ASM2806 config read 正常返回

对比表

项目 | vendor 4.19 (msm_pcie) | 主线 6.19 (pcie-qcom.c)
驱动 | out-of-tree,特有 RC 初始化 | 主线,通用 DWC ops_1_9_0
config_sid 调用 | 从不调用 | iommu-map 非空时调用
BDF_TO_SID_BYPASS | = 1(硬件默认,从未清除) | = 0(被主动清除)
ASM2806 CplD 路径 | 绕过 SMMU → CPU ✅ | SMMU fault → 丢包 ❌
LnkSta | Gen3 x2 | Gen1 x1(PHY 实际不支持 Gen3)

24) 修复:移除 probe() 中的 PARF 写入(当前待验证)

现象:步骤 23 镜像刷机后,UEFI 交接给内核后无任何串口输出(既不 reboot 也不打印),无法看到 earlycon 输出。

根因分析

  • helper 驱动为 builtin_platform_driver,在 initcall 阶段 probe,早于 pcie1 probe(这是 fw_devlink 时钟排序的设计目标)
  • pcie1 的 GDSC(PCIE_1_GDSC)在 pcie1 自身 probe 时才被使能
  • helper 的 probe() 中尝试 ioremap(0x01c0ac00) + readl/writel,但此时 pcie1 GDSC 未开启,PCIe PARF 寄存器组尚未上电
  • 访问未上电的 PARF → NoC/AXI 总线返回错误 → SError 中断 → 内核 panic
  • panic 发生在 earlycon 初始化之前,串口无任何输出

为何之前可工作(步骤 22)

  • 步骤 22 的 probe() 中没有 PARF 访问,bypass 写入仅在 rescan_work() 中(彼时 pcie1 GDSC 已开启)
  • 步骤 23 错误地将 bypass 写入移到了 probe(),引发上电时序问题

变更(单变量,仅驱动):

  • pcie-qcom-eb5-helper.c:从 probe() 中删除 ioremap / readl / writel / iounmap PARF 写入块
  • rescan_work() 中的 bypass re-assert 保持不变(此时 pcie1 GDSC 已开启,PARF 可访问)

启动时序(修复后预期)

~1.2s  eb5-pcie1-helper: lan-en (gpio141) asserted high
~1.2s  eb5-pcie1-helper: clock provider registered, waiting for pcie1 bus
~6-12s qcom-pcie 1c08000.pcie: PCI host bridge to bus 0001:00   ← 首次扫描
       (bypass 尚未写入 → 初次扫描可能 ASM2806 不可见,属正常)
~13-43s eb5-pcie1-helper: rescanning pcie1 (domain 1 bus 00)
        eb5-pcie1-helper: BDF_TO_SID_BYPASS re-asserted   ← GDSC 已开,写入安全
        0001:01:00.0 1b21:2806   ← 期望出现
        0001:04:00.0 10ec:8168

验证口径(刷机后):

dmesg | grep -E 'eb5-pcie1|BDF_TO_SID|0001:|panic'
lspci -nn | grep '^0001:'   # 期望:1b21:2806 + 10ec:8168
ip -br a                    # 期望:eth1/eth2 UP
cat /sys/kernel/debug/gpio | grep 141  # 期望:out high func0 16mA no pull

若首次扫描已见 ASM2806(bypass 硬件默认 = 1 且 /delete-property/ iommu-map 生效):

则无需 rescan,eth1/eth2 应在 ~12s 后自动 UP,无需等待 rescan_work。


24) 实测结果(步骤 24 镜像)

板上状态

LnkSta: Speed 2.5GT/s (downgraded), Width x1 (downgraded)   ← 链路已训练 ✅
BDF_TO_SID_CFG (0x01c0ac00) = 0x00000001                    ← bypass=1 ✅
gpio141: out high func0 16mA no pull                         ← 电气正确 ✅

仍未找到 ASM2806 ❌。rescan 在 31ms 内完成,bridge mem window 显示 disabled。

根因(步骤 24 新增 bug)

iommu: Default domain type: Translated 表明所有未分配 IOMMU group 的设备,其 DMA 事务被 SMMU 静默丢弃(无日志)。

步骤 22 的 /delete-property/ iommu-map 修复了 config_sid 清除 bypass 的问题,但同时移除了 pcie1 的 IOMMU group,导致 SMMU 没有为 pcie1 建立 context bank。

bypass=1 只跳过 PARF 内的 BDF→SID 查表,不绕过 SMMU 本身:completion 到达 SMMU 时使用 bypass SID(= SID 0x1c80,root port 的 SID),而无 context bank → SMMU 静默丢弃 → ASM2806 config read 读到 0xFFFF → 未找到。

完整机制(iommu-map 两条 entry 的作用):

时机 | 事件
pcie1 probe | config_sid() 写入 BDF→SID 表(0x0→0x1c80, 0x100→0x1c81),然后清除 bypass=0
初次扫描 | root port(BDF=0x0) 被找到 → iommu_attach_device → SID 0x1c80 context bank 建立 ✅
初次扫描 | ASM2806(BDF=0x100) completion 用 SID 0x1c81,无 context bank → SMMU fault → 0xFFFF ❌
rescan_work | bypass=1 → 所有 completion 改用 bypass SID 0x1c80 → context bank 已存在 → 通过 ✅

25) 修复:恢复 iommu-map(当前待验证)

变更(单变量,仅 DTS):

  • qrb5165-eb5.dts 中删除 /delete-property/ iommu-map,令 iommu-map 从 sm8250.dtsi 正常继承
  • 其余不变(驱动 probe() 无 PARF 写、rescan_work bypass RMW、无 SBR、gpio141 pinctrl)

预期启动时序

~1.2s  eb5-pcie1-helper: lan-en (gpio141) asserted high
~1.2s  eb5-pcie1-helper: clock provider registered
~6-12s qcom-pcie 1c08000.pcie: PCI host bridge to bus 0001:00  ← config_sid 建立 CB[0x1c80],清除 bypass=0
       初次扫描:仅 root port,ASM2806 SMMU fault(预期)
~13s   eb5-pcie1-helper: BDF_TO_SID_BYPASS re-asserted         ← bypass=1,CB[0x1c80] 已存在
       0001:01:00.0 1b21:2806   ← rescan 时 bypass SID 有 context bank
       0001:04:00.0 10ec:8168

验证口径(刷机后):

dmesg | grep -E 'eb5-pcie1|BDF_TO_SID|0001:|smmu.*fault'
lspci -nn | grep '^0001:'   # 期望:1b21:2806 + 10ec:8168
ip -br a                    # 期望:eth1/eth2 UP

26) 步骤 25 实测:DLActive 轮询超时 30s(Boot 1)

现象(dmesg 关键行):

eb5-pcie1-helper: waiting for DLActive (try 1/30)
...
eb5-pcie1-helper: DLActive never asserted after 30 s

iommu-map 恢复、bypass re-assert 均正确。但 rescan_work 等待 PCI_EXP_LNKSTA_DLLLA
(bit 13,= 0x2000)始终为 0,30 s 后放弃,不执行 rescan → ASM2806 未枚举。

根因:rescan_work 中调用 pci_set_power_state(rp, PCI_D0) 只写 PCI PM 寄存器,
不触发 qcom-pcie 平台设备的 runtime_resume(clocks/PHY 未唤醒)。
链路 LTSSM 处于低功耗状态,DLL 不处于 Active → DLLLA 永远为 0。

修复:用 pm_runtime_resume_and_get(hw_dev) 先唤醒 qcom-pcie 平台设备(clocks/PHY),
再用 pm_runtime_resume_and_get(&rp->dev) 唤醒 pcieport pci_dev,然后触发链路重训练。


27) Boot 2:pm_runtime 加入后 LnkSta=0x1011 持续 30s

变更:在 rescan_work 中改用双层 pm_runtime_resume_and_get(平台设备 + pcieport)。
触发 PCI_EXP_LNKCTL_RL(link retrain)后,轮询 LnkSta

现象(dmesg 关键行):

eb5-pcie1-helper: qcom-pcie platform device resumed (was suspended=no)
eb5-pcie1-helper: root port in D0, resuming via runtime PM
eb5-pcie1-helper: link retrain triggered (LnkCtl=0x0041)
eb5-pcie1-helper: waiting for link-up (LnkSta=0x1011, 0/30 s)
eb5-pcie1-helper: waiting for link-up (LnkSta=0x1011, 5/30 s)
...
eb5-pcie1-helper: link never came up after 30 s (last LnkSta=0x1011)

发现:LnkSta=0x1011 解码:

字段 | 值 | 含义
CLS (bits 3:0) | 0x1 | Gen1 (2.5 GT/s) — 链路已训练
NLW (bits 9:4) | 0x1 | x1 宽度
SLC (bit 12) | 1 | Slot Clock Config
DLLLA (bit 13) | 0 | DLL 未报告 Active

链路物理上已训练完成(Gen1 x1),但 DLLLA = 0。
旧轮询条件 LnkSta & PCI_EXP_LNKSTA_DLLLA 永远不满足 → 超时。

根因:PCI_EXP_LNKCAP_DLLLARC(LnkCap bit 20)= 0,
DWC 硬件上电默认值不置位此 bit,导致 DLLLA 字段只读且始终为 0。
见 PCIe spec §7.8.8:只有 DLLLARC=1 时 DLLLA 才反映真实 DLL 状态。

临时修复:将轮询条件改为"速度非零 + 宽度非零 + 未在训练中",
不再依赖 DLLLA:

if ((lnksta & PCI_EXP_LNKSTA_CLS) &&
    (lnksta & PCI_EXP_LNKSTA_NLW) &&
    !(lnksta & PCI_EXP_LNKSTA_LT))
    /* link up */

28) Boot 3:链路检测通过,但 rescan 仍找不到 ASM2806

现象

eb5-pcie1-helper: link up Gen1 x1 (LnkSta=0x1011) after 0 s
eb5-pcie1-helper: BDF_TO_SID_BYPASS re-asserted
eb5-pcie1-helper: rescanning...
(无 0001:01:00.0 输出)

根因溯源

pci_scan_child_bus() → pci_scan_slot() → pci_bus_read_config_dword() →
pci_bus_ops->read() → dw_pcie_other_conf_map_bus():

static struct pci_bus *dw_pcie_other_conf_map_bus(...)
{
    if (!dw_pcie_link_up(pci))   /* <-- 这里 */
        return NULL;
    ...
}

dw_pcie_link_up() → pci->ops->link_up() → qcom_pcie_link_up():

/*
 * Link-up test as quoted in this write-up: the link is reported as up only
 * when the DLLLA bit is set in the Link Status value that is read back.
 * NOTE(review): simplified excerpt - confirm against the actual driver source.
 */
static bool qcom_pcie_link_up(struct dw_pcie *pci)
{
    u16 val;
    /* Read the Link Status register (PCI_EXP_LNKSTA) into val. */
    pcie_capability_read_word(pci->dev, PCI_EXP_LNKSTA, &val);
    return !!(val & PCI_EXP_LNKSTA_DLLLA);   /* DLLLA=0 -> returns false */
}

因 DLLLARC=0,DLLLA 永远为 0 → qcom_pcie_link_up() 永远返回 false →
dw_pcie_other_conf_map_bus() 返回 NULL → bus 1 的 config read 全返回 0xFFFF →
ASM2806 读到 VID=0xFFFF → 未枚举。

这是真正的根因:不是 bypass,不是 iommu-map,而是 DLLLA=0 导致 DWC 驱动
自己拒绝发出 bus 1 的 config TLP。


29) 根因修复:设置 LnkCap.DLLLARC via DBI_RO_WR_EN

方法:DWC 有 PCIE_MISC_CONTROL_1_OFF(DBI 偏移 0x8BC),bit 0 = DBI_RO_WR_EN。
置位后,原本只读的 DBI 寄存器(含 LnkCap)可写。
写 LnkCap |= PCI_EXP_LNKCAP_DLLLARC,再清除 DBI_RO_WR_EN。
之后做一次 Secondary Bus Reset(SBR)让 LTSSM 重走 Detect→L0,
此后 DLLLA 会被硬件正确维护为 1。

sm8250.dtsi 中 pcie1 DBI 基地址 = 0x40000000(reg-names = "dbi")。

新增函数 eb5_enable_dllla_reporting():

#define EB5_PCIE1_DBI_PHYS      0x40000000UL
#define PCIE_MISC_CONTROL_1_OFF 0x8BC
#define PCIE_DBI_RO_WR_EN       BIT(0)

static void eb5_enable_dllla_reporting(struct pci_dev *rp)
{
    void __iomem *dbi;
    u32 mc1, lnkcap;
    u16 lnksta;

    dbi = ioremap(EB5_PCIE1_DBI_PHYS, 0x1000);
    if (!dbi) {
        pr_warn(DRV_NAME ": cannot ioremap DBI, DLLLARC not set\n");
        return;
    }
    mc1 = readl(dbi + PCIE_MISC_CONTROL_1_OFF);
    writel(mc1 | PCIE_DBI_RO_WR_EN, dbi + PCIE_MISC_CONTROL_1_OFF);
    lnkcap = readl(dbi + rp->pcie_cap + PCI_EXP_LNKCAP);
    writel(lnkcap | PCI_EXP_LNKCAP_DLLLARC,
           dbi + rp->pcie_cap + PCI_EXP_LNKCAP);
    pr_info(DRV_NAME ": LnkCap 0x%08x -> 0x%08x (DLLLARC set)\n",
            lnkcap, lnkcap | PCI_EXP_LNKCAP_DLLLARC);
    writel(mc1, dbi + PCIE_MISC_CONTROL_1_OFF);   /* 恢复 RO 保护 */
    iounmap(dbi);
    pci_read_config_word(rp, rp->pcie_cap + PCI_EXP_LNKSTA, &lnksta);
    pr_info(DRV_NAME ": LnkSta=0x%04x DLLLA=%d (will be 1 after SBR)\n",
            lnksta, !!(lnksta & PCI_EXP_LNKSTA_DLLLA));
}

注:PCI_EXP_LNKCAP_DLLLARC 已在 <linux/pci_regs.h> 中定义为 0x00100000,
不重复 #define。

SBR 序列(rescan_work 中,紧接 eb5_enable_dllla_reporting 之后):

/*
 * SBR: 100ms assert + 500ms retrain wait.
 * Pulses the Secondary Bus Reset bit in the root port's Bridge Control
 * register, then logs LnkSta and its DLLLA bit.
 */
{
    struct pci_dev *rp_sbr = pci_get_domain_bus_and_slot(EB5_PCIE1_DOMAIN, 0, PCI_DEVFN(0,0));
    if (rp_sbr) {
        u16 bctl, lnksta;
        pci_read_config_word(rp_sbr, PCI_BRIDGE_CONTROL, &bctl);
        pci_write_config_word(rp_sbr, PCI_BRIDGE_CONTROL, bctl | PCI_BRIDGE_CTL_BUS_RESET);
        msleep(100);
        /*
         * Clear the reset bit explicitly instead of writing the saved
         * value back: if the bit was already set on entry, restoring it
         * verbatim would leave the secondary bus held in reset.
         */
        pci_write_config_word(rp_sbr, PCI_BRIDGE_CONTROL, bctl & ~PCI_BRIDGE_CTL_BUS_RESET);
        msleep(500);
        pci_read_config_word(rp_sbr, rp_sbr->pcie_cap + PCI_EXP_LNKSTA, &lnksta);
        pr_info(DRV_NAME ": post-SBR LnkSta=0x%04x DLLLA=%d\n",
                lnksta, !!(lnksta & PCI_EXP_LNKSTA_DLLLA));
        /* Release the reference held on rp_sbr. */
        pci_dev_put(rp_sbr);
    } else {
        /* Make a skipped reset visible in dmesg rather than silent. */
        pr_warn(DRV_NAME ": root port not found, SBR skipped\n");
    }
}

期望:post-SBR LnkSta=0x3011(bit 13 = DLLLA=1),qcom_pcie_link_up() 返回 true。


30) Boot 4:SBR 后仍找不到设备(DLLLARC 写入未测试)

现象

eb5-pcie1-helper: link up Gen1 x1 (LnkSta=0x1011) after 0 s
eb5-pcie1-helper: BDF_TO_SID_BYPASS re-asserted
eb5-pcie1-helper: post-SBR LnkSta=0x1011 DLLLA=0
(无 0001:01:00.0)

SBR 触发正确,但 DLLLA 仍为 0。
原因:此版本 eb5_enable_dllla_reporting() 尚未合并,DLLLARC 未写入。


31) Boot 5:pci_get_slot 竞争导致 "root port device not found"

变更:合并 eb5_enable_dllla_reporting()(DBI_RO_WR_EN + DLLLARC 写入)。

现象

eb5-pcie1-helper: root port device not found after 500 ms

rescan_work 调用 pci_find_bus() 成功(bus 结构已创建),
但紧接着 pci_get_slot(root_bus, 0) 失败(root port 的 pci_dev 尚未被
pci_host_probe() 加入,约 13 ms 的窗口)。

修复:添加重试循环,最多 50×10 ms = 500 ms:

/*
 * Poll for the root port's pci_dev: pci_get_slot() is retried up to 50
 * times with a 10 ms sleep after each miss (about 500 ms in total).
 */
int slot_try;
for (slot_try = 0; slot_try < 50; slot_try++) {
    rp = pci_get_slot(root_bus, 0);
    if (rp)
        break;   /* found: stop polling */
    msleep(10);
}
/* Still missing after every retry: log the failure and bail out. */
if (!rp) {
    pr_err(DRV_NAME ": root port device not found after 500 ms\n");
    return false;
}

附注:此 boot 同时发现 format string 中有 UTF-8 箭头(→),
gcc 产生警告。改为 ASCII -> 消除警告。


32) 当前代码状态(Boot 6 前,待验证)

pcie-qcom-eb5-helper.c 关键流程

probe():
  1. gpio141 驱动高(ASM2806 enable,PERST# 之前)
  2. 注册 dummy clock provider(#clock-cells = <0>)
  3. 调度 rescan_work(延迟 10s)

rescan_work():
  1. pci_find_bus(domain=1, busnum=0),最多等待 60s
  2. pci_get_slot() 重试最多 500ms(修复竞争)
  3. pm_runtime_resume_and_get(hw_dev)  ← qcom-pcie 平台设备
  4. pm_runtime_resume_and_get(&rp->dev) ← pcieport pci_dev
  5. 触发 PCI_EXP_LNKCTL_RL(link retrain)
  6. 轮询链路(速度+宽度+非训练中),最多 30s
  7. ioremap(0x01c0ac00) → BDF_TO_SID_BYPASS re-assert(RMW BIT(0)=1)
  8. eb5_enable_dllla_reporting(): DBI_RO_WR_EN + DLLLARC 写入
  9. SBR: PCI_BRIDGE_CTL_BUS_RESET 100ms,等待 500ms
 10. 读 post-SBR LnkSta,期望 DLLLA=1
 11. pci_lock_rescan_remove()
 12. pci_scan_child_bus() + assign resources + add devices
 13. pci_unlock_rescan_remove()
 14. pm_runtime_put(&rp->dev) + pm_runtime_put(hw_dev)
 15. pci_dev_put(rp)

qrb5165-eb5.dts 关键配置

  • iommu-map:9 条 entry(SID 0x1c80–0x1c88),预建所有下游 context bank
  • max-link-speed = <1>:强制 Gen1,避免 DWC 发起 Gen3 equalization
  • num-lanes = <2>
  • clocks 末尾加 <&eb5_pcie1_helper> + clock-names 末尾 "lan-en"
    fw_devlink 确保 helper probe 先于 qcom-pcie
  • eb5_pcie1_helper 节点:lan-en-gpios = <&tlmm 141>,#clock-cells = <0>
  • pcie1_lan_en_default:gpio141,16mA,bias-disable,output-high

待验证(Boot 6 期望)

eb5-pcie1-helper: LnkCap 0x... -> 0x... (DLLLARC set)
eb5-pcie1-helper: post-SBR LnkSta=0x3011 DLLLA=1
0001:01:00.0 PCI bridge [0604]: ASMedia Technology Inc. Device [1b21:2806]
0001:04:00.0 Ethernet controller [0200]: Realtek ... [10ec:8168]

Last Modified: March 23, 2026