最近在研究D1跑裸机程序,工程使用xboot大佬的例程https://whycan.com/t_6683.html/,发现运行很慢,遂简单测量运行速度,cpu配置和测量程序如下所示:
/*
 * Return the current value of the RISC-V `time` CSR.
 *
 * The asm statement is volatile and carries a "memory" clobber, so the
 * compiler neither drops the read nor moves memory accesses across it.
 */
static inline uint64_t counter(void)
{
	uint64_t ticks;

	__asm__ __volatile__(
		"csrr %0, time\n"
		: "=r"(ticks)
		:
		: "memory");

	return ticks;
}
/*
 * Busy-wait for approximately `us` microseconds.
 *
 * The deadline is the current counter() value plus 24 ticks per requested
 * microsecond (i.e. this assumes counter() advances at 24 MHz). The loop
 * polls counter() until the reading is strictly greater than the deadline.
 */
static void sdelay(unsigned long us)
{
	const uint64_t deadline = counter() + us * 24;

	while(counter() <= deadline)
		;
}
/*
 * Program the CPU PLL and switch the CPU clock over to it.
 *
 * Every step is a read-modify-write of CCU_PLL_CPU_CTRL_REG or
 * CCU_RISCV_CLK_REG, with short sdelay() pauses in between. The order of the
 * register accesses is significant and must not be changed.
 *
 * All bit masks use unsigned constants: `1 << 31` on a signed int shifts into
 * the sign bit, which is undefined behaviour in C.
 */
static void set_pll_cpux_axi(void)
{
	uint32_t val;

	/*
	 * Select cpux clock src to osc24m (source field at bits 26:24 = 0) and
	 * write 3 into the field at bit 8 and 1 into the field at bit 0.
	 * NOTE(review): the original comment describes these as "axi divide
	 * ratio is 3, system apb clk ratio is 4" -- confirm the field encoding
	 * against the D1 user manual.
	 */
	write32(D1_CCU_BASE + CCU_RISCV_CLK_REG, (0U << 24) | (3U << 8) | (1U << 0));
	sdelay(1);

	/* Disable pll gating (clear bit 27) */
	val = read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG);
	val &= ~(1U << 27);
	write32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG, val);

	/* Enable pll ldo (set bit 30) */
	val = read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG);
	val |= (1U << 30);
	write32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG, val);
	sdelay(5);

	/*
	 * Set default clk to 1008mhz: clear the fields at bits 17:16, 15:8 and
	 * 1:0, then write 41 into bits 15:8.
	 * NOTE(review): presumably 24 MHz * (41 + 1) = 1008 MHz -- confirm.
	 */
	val = read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG);
	val &= ~((0x3U << 16) | (0xffU << 8) | (0x3U << 0));
	val |= (41U << 8);
	write32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG, val);

	/* Lock enable (set bit 29) */
	val = read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG);
	val |= (1U << 29);
	write32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG, val);

	/* Enable pll (set bit 31) */
	val = read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG);
	val |= (1U << 31);
	write32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG, val);

	/* Wait pll stable: spin until bit 28 reads back as set */
	while(!(read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG) & (0x1U << 28)))
		;
	sdelay(20);

	/* Enable pll gating (set bit 27) */
	val = read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG);
	val |= (1U << 27);
	write32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG, val);

	/* Lock disable (clear bit 29) */
	val = read32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG);
	val &= ~(1U << 29);
	write32(D1_CCU_BASE + CCU_PLL_CPU_CTRL_REG, val);
	sdelay(1);

	/*
	 * Set and change cpu clk src: clear bits 26:24, 9:8 and 3:0, then write
	 * source 5 into bits 26:24 and 1 into the field at bit 8.
	 */
	val = read32(D1_CCU_BASE + CCU_RISCV_CLK_REG);
	val &= ~((0x07U << 24) | (0x3U << 8) | (0xfU << 0));
	val |= ((0x05U << 24) | (0x1U << 8));
	write32(D1_CCU_BASE + CCU_RISCV_CLK_REG, val);
	sdelay(1);
}
/*
 * Bring up the PERIPH0 PLL unless it is already enabled.
 *
 * Returns immediately when bit 31 of CCU_PLL_PERI0_CTRL_REG is already set.
 * Otherwise the PSI clock source field is cleared first, the PLL control
 * register is loaded with a default value, and the PLL is enabled and polled
 * until bit 28 reads back as set.
 *
 * Fixes relative to the previous version:
 *   - the PSI write used write32(val, addr); every other call in this file
 *     is write32(addr, val), so the value was being used as the address;
 *   - `1 << 31` shifted into the sign bit of a signed int (undefined
 *     behaviour); masks are now unsigned.
 */
static void set_pll_periph0(void)
{
	uint32_t val;

	/* Periph0 has been enabled */
	if(read32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG) & (1U << 31))
		return;

	/* Change psi src to osc24m (clear the source field at bits 25:24) */
	val = read32(D1_CCU_BASE + CCU_PSI_CLK_REG);
	val &= ~(0x3U << 24);
	write32(D1_CCU_BASE + CCU_PSI_CLK_REG, val);

	/* Set default val */
	write32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG, 0x63U << 8);

	/* Lock enable (set bit 29) */
	val = read32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG);
	val |= (1U << 29);
	write32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG, val);

	/* Enable pll 600m(1x) 1200m(2x) (set bit 31) */
	val = read32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG);
	val |= (1U << 31);
	write32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG, val);

	/* Wait pll stable: spin until bit 28 reads back as set */
	while(!(read32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG) & (0x1U << 28)))
		;
	sdelay(20);

	/* Lock disable (clear bit 29) */
	val = read32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG);
	val &= ~(1U << 29);
	write32(D1_CCU_BASE + CCU_PLL_PERI0_CTRL_REG, val);
}
/*
 * Configure CCU_PSI_CLK_REG in two steps: first write the low fields
 * (2 at bit 0, 0 at bit 8) with everything else zero, then OR 0x03 into
 * bits 25:24 and pause briefly.
 * NOTE(review): presumably dividers first, then clock source -- confirm the
 * field meanings against the D1 user manual.
 */
static void set_ahb(void)
{
	uint32_t val;

	/* Step 1: low fields only. */
	write32(D1_CCU_BASE + CCU_PSI_CLK_REG, (2 << 0) | (0 << 8));

	/* Step 2: read back and set the field at bits 25:24 to 3. */
	val = read32(D1_CCU_BASE + CCU_PSI_CLK_REG);
	val |= (0x03 << 24);
	write32(D1_CCU_BASE + CCU_PSI_CLK_REG, val);

	sdelay(1);
}
/*
 * Configure CCU_APB0_CLK_REG in two steps: first write the low fields
 * (2 at bit 0, 1 at bit 8) with everything else zero, then OR 0x03 into
 * bits 25:24 and pause briefly.
 * NOTE(review): presumably dividers first, then clock source -- confirm the
 * field meanings against the D1 user manual.
 */
static void set_apb(void)
{
	uint32_t val;

	/* Step 1: low fields only. */
	write32(D1_CCU_BASE + CCU_APB0_CLK_REG, (2 << 0) | (1 << 8));

	/* Step 2: read back and set the field at bits 25:24 to 3. */
	val = read32(D1_CCU_BASE + CCU_APB0_CLK_REG);
	val = (0x03 << 24) | val;
	write32(D1_CCU_BASE + CCU_APB0_CLK_REG, val);

	sdelay(1);
}
/*
 * Prepare the DMA block through CCU_DMA_BGR_REG: set bit 16 (labelled
 * "dma reset" by the original author), wait, then set bit 0 to enable the
 * DMA gating clock.
 */
static void set_dma(void)
{
	uint32_t bgr;

	/* Dma reset */
	bgr = read32(D1_CCU_BASE + CCU_DMA_BGR_REG);
	write32(D1_CCU_BASE + CCU_DMA_BGR_REG, bgr | (1 << 16));
	sdelay(20);

	/* Enable gating clock for dma */
	bgr = read32(D1_CCU_BASE + CCU_DMA_BGR_REG);
	write32(D1_CCU_BASE + CCU_DMA_BGR_REG, bgr | (1 << 0));
}
/*
 * Set bit 30 of CCU_MBUS_CLK_REG (labelled "reset mbus domain" by the
 * original author), leaving the other bits as read, then pause briefly.
 */
static void set_mbus(void)
{
	/* Reset mbus domain */
	write32(D1_CCU_BASE + CCU_MBUS_CLK_REG,
		read32(D1_CCU_BASE + CCU_MBUS_CLK_REG) | (0x1 << 30));
	sdelay(1);
}
/*
 * Enable the PLL whose control register lives at `addr`, unless it is
 * already enabled.
 *
 * If bit 31 of the register is already set, nothing is written. Otherwise
 * bits 31 and 30 are set, lock detection (bit 29) is switched on, the
 * function spins until bit 28 reads back as set, waits a little longer, and
 * finally clears bit 29 again.
 *
 * Masks use unsigned constants: `1 << 31` on a signed int shifts into the
 * sign bit, which is undefined behaviour in C.
 *
 * addr: address of the PLL control register to operate on.
 */
static void set_module(virtual_addr_t addr)
{
	uint32_t val;

	/* Already enabled: leave the register untouched. */
	if(read32(addr) & (1U << 31))
		return;

	/* Set bits 31 and 30 on top of the current register contents. */
	val = read32(addr);
	write32(addr, val | (1U << 31) | (1U << 30));

	/* Lock enable */
	val = read32(addr);
	val |= (1U << 29);
	write32(addr, val);

	/* Wait pll stable: spin until bit 28 reads back as set */
	while(!(read32(addr) & (0x1U << 28)))
		;
	sdelay(20);

	/* Lock disable */
	val = read32(addr);
	val &= ~(1U << 29);
	write32(addr, val);
}
/*
 * Run the full clock bring-up: CPU PLL, PERIPH0 PLL, the AHB/APB/DMA/MBUS
 * setup helpers, and finally set_module() on each PLL control register in
 * the table below. The call order is the same as it has always been.
 */
void sys_clock_init(void)
{
	/* PLL control register offsets passed to set_module(), in call order. */
	const unsigned long pll_ctrl_regs[] = {
		CCU_PLL_PERI0_CTRL_REG,
		CCU_PLL_VIDEO0_CTRL_REG,
		CCU_PLL_VIDEO1_CTRL_REG,
		CCU_PLL_VE_CTRL,
		CCU_PLL_AUDIO0_CTRL_REG,
		CCU_PLL_AUDIO1_CTRL_REG,
	};
	unsigned int i;

	set_pll_cpux_axi();
	set_pll_periph0();
	set_ahb();
	set_apb();
	set_dma();
	set_mbus();

	for(i = 0; i < sizeof(pll_ctrl_regs) / sizeof(pll_ctrl_regs[0]); i++)
		set_module(D1_CCU_BASE + pll_ctrl_regs[i]);
}
/*
 * Timing experiment: initialise the LED, UART and LCD, then sample the
 * RISC-V `time` CSR immediately before and after a run of 50 nop
 * instructions and print both samples.
 *
 * Changes relative to the previous version:
 *   - the samples are uint64_t but were printed with %ld; they are now cast
 *     to unsigned long and printed with %lu so the format matches the
 *     argument type;
 *   - the 50 separate nop statements are emitted from one asm block via
 *     .rept, which keeps them contiguous and the source readable;
 *   - main() returns 0 explicitly.
 *
 * NOTE(review): per the accompanying notes the `time` CSR ticks at 24 MHz,
 * far slower than the core clock, so t2 - t1 is only a coarse estimate.
 */
int main(void)
{
	// OS_start(app_init);  // start the RTOS and run the main thread app_init
	uint64_t t1, t2;

	gpio_init(LIGHT_BLUE, GPIO_OUTPUT, LIGHT_ON);
	uart_init(UART_User, 115200);
	LCD_Init();
	LCD_Clear(GREEN);

	/* First sample of the time counter. */
	__asm__ __volatile__("csrr %0, time\n" : "=r"(t1) :: "memory");

	/* 50 back-to-back nop instructions: the code being timed. */
	__asm__ __volatile__(
		".rept 50\n\t"
		"nop\n\t"
		".endr\n");

	/* Second sample of the time counter. */
	__asm__ __volatile__("csrr %0, time\n" : "=r"(t2) :: "memory");

	printf("t1:%lu,t2:%lu,\n", (unsigned long)t1, (unsigned long)t2);

	return 0;
}
上面的程序连续执行50条nop指令(按每条nop占1个时钟周期计,即延时50个周期),并在前后各读取一次time寄存器,用两次读数之差计算这50个时钟周期所用的时间。已知time寄存器的计数频率是24MHz,每加1用时约41.7ns。串口输出结果:t1:118929024,t2:118929029,time寄存器计了5个数(5×41.7ns≈208ns),得出结论:50个时钟周期用时约200多ns,主频大约为250MHz,低于配置的1GHz。
希望请教下我测量运行速度的方式是否有问题?频率配置是否有误?
最近编辑记录 PENPEN (2025-08-27 15:36:23)
离线
后续改用__asm__ __volatile__("csrr %0, cycle\n" : "=r"(c1) :: "memory");读取cycle寄存器的时钟周期数,测得100个nop所用的总时钟周期数为476,平均每个nop用了4.76个时钟周期,再据此计算主频约为1.14GHz,因此主频配置没错。但每个nop正常应该只消耗1个时钟周期吧?为什么这里平均会消耗4个多时钟周期?
离线