因为在做3×3卷积的时候,图像大小会变小,具体计算公式如下:
$$O = \frac{I - K + 2P}{S} + 1$$
其中O是输出特征图的大小,I是输入特征图的大小,P是Padding的大小,K是卷积核的大小,S是Stride的大小。当K的值是3、P的值是1、S的值也是1的时候,O的值和I的值相等。
为了保持输出图像的大小在经过卷积后和输入的大小一样,我们需要进行Padding操作,在这里我采用了复制周围一圈的方式来完成。
import numpy as np


def replicate_pad(img_gray):
    """Return ``img_gray`` with a one-pixel border copied from its outermost pixels.

    Args:
        img_gray: 2-D array (one grayscale image).

    Returns:
        Array of shape ``(h + 2, w + 2)`` with the same dtype as the input.
        Corner pixels take the value of the nearest image corner, which is
        what the original row-then-column copy produced.
    """
    return np.pad(img_gray, 1, mode="edge")


def sobel_threshold(img_gray, th=200):
    """Binarised Sobel edge map using the |Gx| + |Gy| magnitude approximation.

    Args:
        img_gray: 2-D array of pixel intensities.
        th: threshold; a pixel is an edge when ``|Gx| + |Gy| > th``.

    Returns:
        ``uint8`` array with the same shape as ``img_gray``: 255 on edges,
        0 elsewhere.

    Raises:
        ValueError: if ``img_gray`` is not 2-D.
    """
    img_gray = np.asarray(img_gray)
    if img_gray.ndim != 2:
        raise ValueError("sobel_threshold expects a 2-D grayscale image")

    # Widen before summing: a 3-tap weighted sum reaches 1020, which wraps
    # around in uint8 arithmetic.
    p = replicate_pad(img_gray).astype(np.int32)

    # Each slice below is the h x w window of one kernel tap, so every pixel
    # of the input - including the last row and column - gets a result.
    right = p[:-2, 2:] + 2 * p[1:-1, 2:] + p[2:, 2:]
    left = p[:-2, :-2] + 2 * p[1:-1, :-2] + p[2:, :-2]
    top = p[:-2, :-2] + 2 * p[:-2, 1:-1] + p[:-2, 2:]
    bottom = p[2:, :-2] + 2 * p[2:, 1:-1] + p[2:, 2:]

    magnitude = np.abs(right - left) + np.abs(top - bottom)
    return np.where(magnitude > th, 255, 0).astype(np.uint8)


def main():
    """Load the test image, run the Sobel model and display source and result."""
    # Imported here so the NumPy-only helpers above can be used without OpenCV.
    import cv2 as cv

    # NOTE(review): this path looks like it lost its backslashes during
    # extraction (possibly G:\shiyan\IDc.jpg) - confirm before running.
    path = r"G:shiyanIDc.jpg"
    img = cv.imread(path)
    if img is None:
        # cv.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("cannot read image: " + path)

    # cv.imread returns channels in BGR order, so convert with BGR2GRAY.
    img_gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

    th = 200
    sobel_rf = sobel_threshold(img_gray, th)

    cv.imshow("sobel_rf", sobel_rf)
    cv.imshow("src", img_gray)
    cv.waitKey()
    cv.destroyAllWindows()


if __name__ == "__main__":
    main()
根据算法模型完成HDL:提供SpinalHDL源码
import spinal.core._
import spinal.lib._

/**
 * Thresholded Sobel edge detector working on a 3x3 pixel window.
 *
 * The incoming pixel stream is turned into a 9-tap window by `GenMatrix`;
 * two weighted sums per direction are formed, their absolute differences are
 * added, and the result is compared against `th`. The output pixel is 255
 * when the sum exceeds `th` and 0 otherwise.
 *
 * The datapath is four register stages deep (weighted sums, absolute
 * differences, their sum, thresholded pixel), so `row`, `col` and the sync
 * signals are delayed by four cycles to stay aligned with the data.
 *
 * @param th          edge threshold compared against GX + GY
 * @param imageColNum passed to `ImageStream` / `GenMatrix` (column count)
 * @param imageRowNum passed to `ImageStream` / `GenMatrix` (row count)
 */
class Sobel(th: Int, imageColNum: Int, imageRowNum: Int) extends Component {
  val io = new Bundle {
    val dataIn = slave(ImageStream(8, imageColNum, imageRowNum, 1))
    val dataOut = master(ImageStream(8, imageColNum, imageRowNum, 1))
  }
  noIoPrefix()

  // First GenMatrix argument: imageColNum rounded up to a power of two.
  val genMatrix = new GenMatrix(scala.math.pow(2, log2Up(imageColNum)).toInt, imageColNum, imageRowNum)
  genMatrix.io.dataIn <> io.dataIn

  val genMatrixOut = ImageStream(8, imageColNum, imageRowNum, 9)
  genMatrixOut := genMatrix.io.dataOut

  /** Registered sum tap(a) + 2 * tap(b) + tap(c); the carry-keeping adds give 11 bits. */
  private def weightedSum(a: Int, b: Int, c: Int) =
    RegNext(
      genMatrixOut.data(a).asUInt +^
        (genMatrixOut.data(b) ## B"1'b0").asUInt +^
        genMatrixOut.data(c).asUInt
    )

  /** Registered |x - y|, built as larger minus smaller so the subtraction never underflows. */
  private def absDiff(x: UInt, y: UInt): UInt = {
    val diff = Reg(UInt(11 bits))
    when(x > y) {
      diff := x - y
    } otherwise {
      diff := y - x
    }
    diff
  }

  // Stage 1: the two opposing weighted sums for each gradient direction.
  val GX1 = weightedSum(0, 1, 2)
  val GX2 = weightedSum(6, 7, 8)
  val GY1 = weightedSum(6, 3, 0)
  val GY2 = weightedSum(8, 5, 2)

  // Stage 2: absolute gradient per direction.
  val GX = absDiff(GX1, GX2)
  val GY = absDiff(GY1, GY2)

  // Stage 3: gradient magnitude approximated as |GX| + |GY|.
  val G = RegNext(GX + GY)

  // Stage 4: binarise. The default is assigned first; the `when` overrides it.
  val sobelOut = Reg(Bits(8 bits))
  sobelOut := 0
  when(G > th) {
    sobelOut := 255
  }

  io.dataOut.data(0) := sobelOut

  // Side-band signals follow the four register stages of the datapath.
  io.dataOut.row := Delay(genMatrixOut.row, 4)
  io.dataOut.col := Delay(genMatrixOut.col, 4)
  io.dataOut.c.hsync := Delay(genMatrixOut.c.hsync, 4, init = False)
  io.dataOut.c.vsync := Delay(genMatrixOut.c.vsync, 4, init = False)
  io.dataOut.c.de := Delay(genMatrixOut.c.de, 4, init = False)
}

/** Generates Verilog for a 640x480 Sobel instance with threshold 200. */
object Sobel extends App {
  SpinalConfig().generateVerilog(new Sobel(200, 640, 480))
}
仿真代码:
import spinal.lib._
import spinal.core._
import spinal.core.sim._
import scala.collection.mutable.Queue
import java.io.FileOutputStream
import scala.io.Source

/**
 * Simulation wrapper around a 430x430 [[Sobel]] instance.
 *
 * It drives pixel files into the DUT, records the DUT output, reads the
 * reference output and compares the two streams pixel by pixel.
 *
 * @param th Sobel threshold forwarded to the DUT
 */
class tbSobelC(th: Int) extends Sobel(th, 430, 430) {
  var src = Array[String]()
  var destDut = Array[String]()
  var destRef = Array[String]()
  var width = Array[Int]()
  var high = Array[Int]()
  val dutData = Queue[Int]()
  val refData = Queue[Int]()
  var frameLen = 0

  // Kept as a field so waitSimDone can close the file before ending the run.
  private var dutOutFile: Option[FileOutputStream] = None

  /**
   * Starts the clock, drives idle values on the inputs and stores the file lists.
   *
   * @param srcFile     one input pixel file per frame (one decimal value per line)
   * @param destDutFile DUT output files; only the first entry is written
   * @param destRefFile one reference file per frame (one decimal value per line)
   * @param imgShape    per-frame (width, height)
   */
  def init(srcFile: Array[String], destDutFile: Array[String], destRefFile: Array[String], imgShape: Array[(Int, Int)]) = {
    clockDomain.forkStimulus(10)
    io.dataIn.data(0) #= 0
    io.dataIn.row #= 0
    io.dataIn.col #= 0
    src = srcFile
    destDut = destDutFile
    destRef = destRefFile
    io.dataIn.c.de #= false
    io.dataIn.c.vsync #= false
    io.dataIn.c.hsync #= false
    frameLen = src.length
    width = imgShape.map(_._1)
    high = imgShape.map(_._2)
    clockDomain.waitSampling(10)
  }

  /**
   * Drives one frame from `src` into the DUT, one pixel per line of the file.
   *
   * `de` is dropped for 20 cycles after each line and for 200 cycles after
   * the last pixel of the frame. The input file is closed even when a line
   * fails to parse.
   *
   * @param src   path of the pixel file
   * @param width pixels per line
   * @param high  lines per frame
   */
  def frame(src: String, width: Int, high: Int): Unit = {
    val srcFile = Source.fromFile(src)
    try {
      val srcData = srcFile.getLines()
      var colCnt = 0
      var rowCnt = 0
      // NOTE(review): row is driven with `width` and col with `high`, which
      // looks swapped relative to the names. The test image is square so it
      // makes no difference here - confirm against ImageStream.
      io.dataIn.row #= width
      io.dataIn.col #= high
      io.dataIn.c.de #= false
      io.dataIn.c.vsync #= false
      io.dataIn.c.hsync #= false
      clockDomain.waitSampling(20)
      while (srcData.hasNext) {
        val data = srcData.next()
        io.dataIn.data(0) #= data.toInt
        io.dataIn.c.de #= true
        // vsync marks the first pixel of the frame.
        if (colCnt == 0 && rowCnt == 0) {
          io.dataIn.c.vsync #= true
          println("xx")
        } else {
          io.dataIn.c.vsync #= false
        }
        // Last pixel of the frame: let it be sampled, then a long blanking gap.
        if (colCnt == width - 1 && rowCnt == high - 1) {
          clockDomain.waitSampling(1)
          io.dataIn.c.de #= false
          clockDomain.waitSampling(200)
        }
        // hsync marks the first pixel of each line.
        if (colCnt == 0) {
          io.dataIn.c.hsync #= true
        } else {
          io.dataIn.c.hsync #= false
        }
        // Last pixel of any other line: let it be sampled, then a short gap.
        if (colCnt == width - 1 && rowCnt != high - 1) {
          clockDomain.waitSampling(1)
          io.dataIn.c.de #= false
          clockDomain.waitSampling(20)
        }
        if (colCnt == width - 1) {
          colCnt = 0
          if (rowCnt == high - 1) {
            rowCnt = 0
          } else {
            rowCnt = rowCnt + 1
          }
        } else {
          colCnt = colCnt + 1
        }
        clockDomain.waitSampling()
      }
      clockDomain.waitSampling(1000)
    } finally {
      srcFile.close()
    }
  }

  /** Forks a thread that sends every configured frame in order. */
  def driver: Unit = {
    fork {
      for (i <- 0 until frameLen) {
        println(s"frame = ${i}")
        frame(src(i), width(i), high(i))
      }
    }
    ()
  }

  /**
   * Forks a monitor that records each output pixel seen while `de` is high,
   * both into `dutData` and, space separated, into `destDut(0)`.
   */
  def dutOut: Unit = {
    val out = new FileOutputStream(destDut(0))
    dutOutFile = Some(out)
    fork {
      while (true) {
        if (io.dataOut.c.de.toBoolean) {
          val pixel = io.dataOut.data(0).toInt
          dutData.enqueue(pixel)
          out.write((pixel.toString + " ").getBytes())
        }
        clockDomain.waitSampling()
      }
    }
    ()
  }

  /**
   * Forks a thread that feeds the reference files into `refData`, one value
   * per clock cycle.
   *
   * Each file is read once and then closed. The driver sends every frame
   * once, so replaying the reference files would only add values that no DUT
   * output corresponds to.
   */
  def refFun: Unit = {
    fork {
      for (i <- 0 until frameLen) {
        val file = Source.fromFile(destRef(i))
        try {
          val srcData = file.getLines()
          while (srcData.hasNext) {
            clockDomain.waitSampling()
            refData.enqueue(srcData.next().toInt)
          }
        } finally {
          file.close()
        }
      }
    }
    ()
  }

  /**
   * Forks the checker: pairs DUT and reference values in arrival order and
   * asserts that they differ by less than 5.
   */
  def scoreBoard: Unit = {
    fork {
      var index = 0
      while (true) {
        while (dutData.nonEmpty && refData.nonEmpty) {
          clockDomain.waitSampling()
          val dut = dutData.dequeue()
          val ref = refData.dequeue()
          // Incremented before the check, so the reported index is 1-based.
          index = index + 1
          assert(scala.math.abs(ref - dut) < 5, s"index:${index}, dutData:${dut} refData:${ref}")
        }
        clockDomain.waitSampling()
      }
    }
    ()
  }

  /**
   * Blocks until the DUT has produced every pixel of every configured frame,
   * waits 3000 more cycles so the checker can drain, closes the output file
   * and ends the simulation successfully.
   */
  def waitSimDone: Unit = {
    fork {
      // Count all frames, not only the first, so multi-frame runs finish.
      val expected = (0 until frameLen).map(i => width(i) * high(i)).sum
      var index = 0
      while (index < expected) {
        clockDomain.waitSampling()
        if (io.dataOut.c.de.toBoolean) {
          index = index + 1
        }
      }
      clockDomain.waitSampling(3000)
      // simSuccess() ends the run without resuming the monitor thread, so the
      // output file has to be closed here.
      dutOutFile.foreach(_.close())
      dutOutFile = None
      simSuccess()
    }.join()
  }
}

/** Compiles the testbench with threshold 100 and runs one 430x430 frame. */
class tbSobel {
  val testFile = Array("testGray.txt")
  val dutFile = Array("testDut.txt")
  val refFile = Array("testSobel.txt")
  val imgShape = Array((430, 430))
  val dut = SimConfig.withConfig(SpinalConfig(inlineRom = true)).withWave.compile(new tbSobelC(100))
  dut.doSim { dut =>
    dut.init(testFile, dutFile, refFile, imgShape)
    dut.driver
    dut.refFun
    dut.dutOut
    dut.scoreBoard
    dut.waitSimDone
  }
}

/** Entry point: constructing tbSobel runs the simulation. */
object tbSobel extends App {
  val tb = new tbSobel
}
经过分析之后,该代码可以跑到238MHz,占用330LUT,312FF。
审核编辑:刘清
评论
查看更多