Вычислить сумму значений массива параллельно с металлом Swift

Question

Вычислить сумму значений массива параллельно с металлом Swift

Я пытаюсь вычислить сумму большого массива параллельно с металлом Swift.

Есть ли бог способ сделать это?

Моя плоскость заключалась в том, что я делил свой массив на вложенные массивы, вычислял сумму одного вложенного массива параллельно, а затем, когда параллельные вычисления заканчивались, вычислял сумму вложенных сумм.

например, если у меня есть

array = [a0,....an]

Я делю массив на подмассивы:

array_1 = [a_0,...a_i],
array_2 = [a_i+1,...a_2i],
....
array_n/i = [a_n-1, ... a_n]

суммы для этого массива вычисляется параллельно, и я получаю

sum_1, sum_2, sum_3, ... sum_n/1

в конце просто вычислите сумму сумм.

Я создаю приложение, которое запускает мой металлический шейдер, но некоторые вещи я не совсем понимаю.

        var array:[[Float]] = [[1,2,3], [4,5,6], [7,8,9]]

        // get device
        let device: MTLDevice! = MTLCreateSystemDefaultDevice()

        // get library
        let defaultLibrary:MTLLibrary! = device.newDefaultLibrary()

        // queue
        let commandQueue:MTLCommandQueue! = device.newCommandQueue()

        // function
        let kernerFunction: MTLFunction! = defaultLibrary.newFunctionWithName("calculateSum")

        // pipeline with function
        let pipelineState: MTLComputePipelineState! = try device.newComputePipelineStateWithFunction(kernerFunction)

        // buffer for function
        let commandBuffer:MTLCommandBuffer! = commandQueue.commandBuffer()

        // encode function
        let commandEncoder:MTLComputeCommandEncoder = commandBuffer.computeCommandEncoder()

        // add function to encode
        commandEncoder.setComputePipelineState(pipelineState)

        // options
        let resourceOption = MTLResourceOptions()

        let arrayBiteLength = array.count * array[0].count * sizeofValue(array[0][0])

        let arrayBuffer = device.newBufferWithBytes(&array, length: arrayBiteLength, options: resourceOption)

        commandEncoder.setBuffer(arrayBuffer, offset: 0, atIndex: 0)

        var result:[Float] = [0,0,0]

        let resultBiteLenght = sizeofValue(result[0])

        let resultBuffer = device.newBufferWithBytes(&result, length: resultBiteLenght, options: resourceOption)

        commandEncoder.setBuffer(resultBuffer, offset: 0, atIndex: 1)

        let threadGroupSize = MTLSize(width: 1, height: 1, depth: 1)

        let threadGroups = MTLSize(width: (array.count), height: 1, depth: 1)

        commandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupSize)

        commandEncoder.endEncoding()

        commandBuffer.commit()

        commandBuffer.waitUntilCompleted()

        let data = NSData(bytesNoCopy: resultBuffer.contents(), length: sizeof(Float), freeWhenDone: false)

        data.getBytes(&result, length: result.count * sizeof(Float))

        print(result)

мой код Swift,

мой шейдер это:

kernel void calculateSum(const device float *inFloat [[buffer(0)]],
                     device float *result [[buffer(1)]],
                     uint id [[ thread_position_in_grid ]]) {


    float * f = inFloat[id];
    float sum = 0;
    for (int i = 0 ; i < 3 ; ++i) {
        sum = sum + f[i];
    }

    result = sum;
}

Я не знаю, как определить, что inFloat является массивом массивов. Я не знаю точно, что такое threadGroupSize и threadGroups. Я не знаю, что такое device и uint в свойствах шейдера.

Это правильный подход?

12

swift sum shader metal

Источник

user2791346 02 июл '16 в 21:29

1 ответ

Решение

В принятом ответе досадно отсутствует ядро, которое было написано для него. Исходный код здесь, но вот полная программа и шейдер, которые можно запустить как быстрое приложение командной строки.

/*
 * Command line Metal Compute Shader for data processing
 */

import Metal
import Foundation
//------------------------------------------------------------------------------
let count = 10_000_000
let elementsPerSum = 10_000
//------------------------------------------------------------------------------
typealias DataType = CInt // Data type, has to be the same as in the shader
//------------------------------------------------------------------------------
let device = MTLCreateSystemDefaultDevice()!
let library = device.makeDefaultLibrary()!
let parsum = library.makeFunction(name: "parsum")!
let pipeline = try! device.makeComputePipelineState(function: parsum)
//------------------------------------------------------------------------------
// Our data, randomly generated:
var data = (0..<count).map{ _ in DataType(arc4random_uniform(100)) }
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
// Number of individual results = count / elementsPerSum (rounded up):
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum
//------------------------------------------------------------------------------
// Our data in a buffer (copied):
let dataBuffer = device.makeBuffer(bytes: &data, length: MemoryLayout<DataType>.stride * count, options: [])!
// A buffer for individual results (zero initialized)
let resultsBuffer = device.makeBuffer(length: MemoryLayout<DataType>.stride * resultsCount, options: [])!
// Our results in convenient form to compute the actual result later:
let pointer = resultsBuffer.contents().bindMemory(to: DataType.self, capacity: resultsCount)
let results = UnsafeBufferPointer<DataType>(start: pointer, count: resultsCount)
//------------------------------------------------------------------------------
let queue = device.makeCommandQueue()!
let cmds = queue.makeCommandBuffer()!
let encoder = cmds.makeComputeCommandEncoder()!
//------------------------------------------------------------------------------
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, index: 0)
encoder.setBytes(&dataCount, length: MemoryLayout<CUnsignedInt>.size, index: 1)
encoder.setBuffer(resultsBuffer, offset: 0, index: 2)
encoder.setBytes(&elementsPerSumC, length: MemoryLayout<CUnsignedInt>.size, index: 3)
//------------------------------------------------------------------------------
// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)

// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
//------------------------------------------------------------------------------
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
//------------------------------------------------------------------------------
var start, end : UInt64
var result : DataType = 0
//------------------------------------------------------------------------------
start = mach_absolute_time()
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
    result += elem
}

end = mach_absolute_time()
//------------------------------------------------------------------------------
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
//------------------------------------------------------------------------------
result = 0

start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
    for elem in buffer {
        result += elem
    }
}
end = mach_absolute_time()

print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
//------------------------------------------------------------------------------

#include <metal_stdlib>
using namespace metal;

typedef unsigned int uint;
typedef int DataType;

kernel void parsum(const device DataType* data [[ buffer(0) ]],
                   const device uint& dataLength [[ buffer(1) ]],
                   device DataType* sums [[ buffer(2) ]],
                   const device uint& elementsPerSum [[ buffer(3) ]],

                   const uint tgPos [[ threadgroup_position_in_grid ]],
                   const uint tPerTg [[ threads_per_threadgroup ]],
                   const uint tPos [[ thread_position_in_threadgroup ]]) {

    uint resultIndex = tgPos * tPerTg + tPos;

    uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end

    for (; dataIndex < endIndex; dataIndex++)
        sums[resultIndex] += data[dataIndex];
}

7

Источник

user8876321 23 авг '19 в 13:54

Я запустил приложение. на gt 740 (384 ядра) против i7-4790 с многопоточной реализацией векторной суммы, и вот мои цифры:

Metal lap time: 19.959092
cpu MT lap time: 4.353881

для процессора это соотношение 5/1, поэтому если у вас нет мощного графического процессора, использование шейдеров того не стоит.

я тестировал тот же код в i7-3610qm с igpu intel hd 4000, и неожиданно результаты стали намного лучше для металла: 2/1

отредактировано: после настройки с параметром потока я, наконец, улучшил производительность графического процессора, теперь он до 16xcpu

-3

Источник

user1578968 29 мар '19 в 14:54

Другие вопросы по тегам swift sum shader metal

user3443689 03 июл '16 в 00:47 2016-07-03 00:47 · Accepted Answer · 2016-07-03 00:47

Я нашел время, чтобы создать полностью рабочий пример этой проблемы с металлом. Объяснение в комментариях:

import Metal

let count = 10_000_000
let elementsPerSum = 10_000

// Data type, has to be the same as in the shader
typealias DataType = CInt

let device = MTLCreateSystemDefaultDevice()!
let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
let pipeline = try! device.newComputePipelineStateWithFunction(parsum)

var data = (0..<count).map{ _ in DataType(arc4random_uniform(100)) } // Our data, randomly generated
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)

let dataBuffer = device.newBufferWithBytes(&data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)
let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(resultsBuffer.contents()), count: resultsCount) // Our results in convenient form to compute the actual result later

let queue = device.newCommandQueue()
let cmds = queue.commandBuffer()
let encoder = cmds.computeCommandEncoder()

encoder.setComputePipelineState(pipeline)

encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

// We have to calculate the sum `resultCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)

// Here we set that each threadgroup should process `threadExecutionWidth` threads, the only important thing for performance is that this number is a multiple of `threadExecutionWidth` (here 1 times)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)

encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()

var start, end : UInt64
var result : DataType = 0

start = mach_absolute_time()
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
    result += elem
}

end = mach_absolute_time()

print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0

start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
    for elem in buffer {
        result += elem
    }
}
end = mach_absolute_time()

print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")

Shader:

// Data type, has to be the same as in the Swift file
typedef int DataType;

kernel void parsum(const device DataType* data [[ buffer(0) ]],
                   const device uint& dataLength [[ buffer(1) ]],
                   device DataType* sums [[ buffer(2) ]],
                   const device uint& elementsPerSum [[ buffer(3) ]],

                   const uint tgPos [[ threadgroup_position_in_grid ]],
                   const uint tPerTg [[ threads_per_threadgroup ]],
                   const uint tPos [[ thread_position_in_threadgroup ]]) {
    uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result, this var is unique to this thread
    uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end

    for (; dataIndex < endIndex; dataIndex++)
        sums[resultIndex] += data[dataIndex];
}

Я использовал свой Mac, чтобы протестировать его, но на iOS он должен нормально работать.

Выход:

Metal result: 494936505, time: 0.024611456
CPU result: 494936505, time: 0.163341018

Металлическая версия примерно в 7 раз быстрее. Я уверен, что вы можете получить больше скорости, если реализуете что-то вроде "разделяй и властвуй" с отсечкой или чем-то еще.