
Survey graph-building APIs from native ecosystem #20

Closed
huningxin opened this issue May 10, 2019 · 5 comments

@huningxin
Contributor

Per the resolution on the 9 May 2019 CG call, this issue surveys graph-building APIs from the native ecosystem to support the discussion in #16. The current foundation spec is a direct derivative of the Android NNAPI, which is a C-style API. During the CG call, the participants agreed to survey other graph-building APIs in the native ecosystem to learn their API design patterns.

There were three APIs mentioned in the CG call.

@walrusmcd mentioned he "would love to contribute our learnings" from two of Microsoft's graph-building APIs. Feel free to add them to the list. Thanks.

@huningxin
Contributor Author

huningxin commented May 29, 2019

  • nGraph

The following code snippets use the nGraph C++ API to build and execute a computation graph that is the same as the one in the WebNN example (tensor0 and tensor2 are constants, tensor1 and tensor3 are user inputs):

tensor0 ---+
           +--- ADD ---> intermediateOutput0 ---+
tensor1 ---+                                    |
                                                +--- MUL---> output
tensor2 ---+                                    |
           +--- ADD ---> intermediateOutput1 ---+
tensor3 ---+

The following code builds the graph.

    // Build the graph
    const Shape shape{2, 2, 2, 2};
    const size_t size = shape_size(shape);
    const std::vector<float> constant_data(size, 0.5);

    auto tensor0 = std::make_shared<op::Constant>(element::f32, shape, constant_data);
    auto tensor1 = std::make_shared<op::Parameter>(element::f32, shape);
    auto tensor2 = std::make_shared<op::Constant>(element::f32, shape, constant_data);
    auto tensor3 = std::make_shared<op::Parameter>(element::f32, shape);

    auto add0 = std::make_shared<op::Add>(tensor0, tensor1);
    auto add1 = std::make_shared<op::Add>(tensor2, tensor3);

    auto mul = std::make_shared<op::Multiply>(add0, add1);

    // Make the function for the graph
    // The 1st argument specifies the results/outputs. 
    // The 2nd argument specifies the inputs.
    auto function = std::make_shared<Function>(NodeVector{mul},
                                               ParameterVector{tensor1, tensor3});

The following code compiles the graph.

    // Create the backend and compile the function
    auto backend = runtime::Backend::create("CPU");
    auto exec = backend->compile(function);

The following code executes the compiled graph.

    // Allocate tensors for inputs
    auto input0 = backend->create_tensor(element::f32, shape);
    auto input1 = backend->create_tensor(element::f32, shape);

    // Allocate tensor for output
    auto output = backend->create_tensor(element::f32, shape);

    // Initialize the input tensors
    const std::vector<float> input_data0(size, 1), input_data1(size, 2);
    input0->write(input_data0.data(), 0, sizeof(float)*input_data0.size());
    input1->write(input_data1.data(), 0, sizeof(float)*input_data1.size());

    // Invoke the function
    exec->call({output}, {input0, input1});

    // Get the result
    std::vector<float> output_data(size);
    output->read(output_data.data(), 0, sizeof(float)*output_data.size());

The complete example is hosted here.

@huningxin
Contributor Author

  • MPSNNGraph

My colleague @fujunwei helped create an example with MPSNNGraph API.

The following code builds the graph.

  // Build the graph.
  const std::vector<int> shape = {2, 2, 2, 2};
  size_t length = 16;
  const std::vector<__fp16> constant_data(length, 0.5);
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
  id<MTLCommandBuffer> command_buffer = [[device newCommandQueue] commandBuffer];

  MPSImage* constant0 = CreateMPSImageWithData(device, constant_data, shape);
  MPSNNImageNode* tensor0 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:constant0]];
  MPSImage* input0 = CreateMPSImage(device, shape);
  MPSNNImageNode* tensor1 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:input0]];
  MPSImage* constant1 = CreateMPSImageWithData(device, constant_data, shape);
  MPSNNImageNode* tensor2 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:constant1]];
  MPSImage* input1 = CreateMPSImage(device, shape);
  MPSNNImageNode* tensor3 = [MPSNNImageNode nodeWithHandle:[[MPSImageHandle alloc]
                                                            initWithImage:input1]];
  MPSNNAdditionNode* add_0 = [MPSNNAdditionNode nodeWithLeftSource:tensor0
                                                       rightSource:tensor1];
  MPSNNAdditionNode* add_1 = [MPSNNAdditionNode nodeWithLeftSource:tensor2
                                                       rightSource:tensor3];
  MPSNNMultiplicationNode* mul = [MPSNNMultiplicationNode nodeWithLeftSource:add_0.resultImage
                                                                 rightSource:add_1.resultImage];
  MPSNNGraph* graph = [MPSNNGraph graphWithDevice:device
                                      resultImage:mul.resultImage
                              resultImageIsNeeded:true];

The following code executes the graph with input data and gets output data.

  // Execute the graph.
  NSMutableArray<MPSImage*>* image_array = [NSMutableArray arrayWithCapacity:1];
  const std::vector<__fp16> input_data0(length, 1);
  const std::vector<__fp16> input_data1(length, 2);
  UploadDataToMPSImage(input0, input_data0);
  UploadDataToMPSImage(input1, input_data1);
  NSArray<MPSImageHandle*> * handles = graph.sourceImageHandles;
  for (size_t i = 0; i < handles.count; ++i) {
    [image_array addObject:handles[i].image];
  }
  MPSImage* output_image = [graph encodeToCommandBuffer:command_buffer
                                           sourceImages:image_array
                                           sourceStates:nullptr
                                           intermediateImages:nullptr
                                           destinationStates:nullptr];

  // Get output data.
  size_t size = length * sizeof(__fp16);
  id<MTLBuffer> output_buffer = CreateOutputBuffer(device, command_buffer, output_image, size);

  [command_buffer commit];
  [command_buffer waitUntilCompleted];
  std::vector<__fp16> output_data(length);
  memcpy(output_data.data(), [output_buffer contents], size);
  std::cout << "[";
  for (size_t i = 0; i < length; ++i) {
    std::cout << output_data[i] << ' ';
  }
  std::cout << ']' << std::endl;

The complete example is hosted here.

@walrusmcd

Thanks @huningxin !

Here are two examples we would love to share:

  1. The ONNX Runtime

ONNX is an OSS standards body that drives opsets and schemas for cross-framework interchange in ML.
GitHub here.

Microsoft has also contributed an ML runtime that works with ONNX, called the ONNX Runtime (ORT). This is a cross-platform, high-performance ML engine that works with multiple forms of hardware acceleration. We ship this runtime built into Windows starting with version 1809 (October 2018 Update). GitHub here.

Execution providers are the abstraction layer between the runtime and the provider that supplies operator kernels (implementations).

Graphs are used to work with execution providers.

This is the mechanism that the provider can use to participate in graph optimization and rewriting.

Notice that the pattern here is to provide a full graph representation, with Nodes and Edges.

This implementation provides a lot of power by allowing for things like:

  • SubGraphs
  • Mutable graphs
  • Graph rewriting
  • Readonly views of the graphs
  • Edges and ControlEdges (where a control edge defines a flow dependency, but not an input/output dependency)

This is the overall mechanism for the provider.

These IndexedSubGraphs and their Nodes are the currency between the runtime and the provider.

The runtime will call IExecutionProvider::Compile() on the subgraphs that the provider says it can handle. This then allows the provider to build the kernels and compute functions for those Nodes (ops).

This approach allows multiple providers to participate in the execution of the Graph, or a single provider can handle the entire graph (in our case we have both CPU and DirectX GPU providers that can handle entire graphs). This is largely driven by our opset schema. It gives us great flexibility: new ops can appear, and not every provider has to be updated, as long as there is at least one provider that can handle the new op.

This is how we tend to do our GPU and hardware-accelerated work, while allowing fallback to CPU.

We have reference implementations for many providers, including some that work with graphs (nGraph, TensorRT, etc.) here.
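
As a rough, hedged illustration of how an application drives this from the outside, the following minimal sketch (not taken from the comment above) uses the public ONNX Runtime C++ API to load an ONNX graph, optionally register an accelerated execution provider, and run it. The model path and tensor names ("model.onnx", "tensor1", "output") are placeholders, and the shape mirrors the 2x2x2x2 example used earlier in this thread.

    // Minimal ONNX Runtime C++ API sketch (placeholder model/tensor names).
    #include <onnxruntime_cxx_api.h>
    #include <vector>

    int main() {
      Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "graph-api-survey");
      Ort::SessionOptions options;
      // Optionally register an accelerated execution provider here; nodes it
      // cannot handle fall back to the default CPU provider, e.g.:
      // OrtSessionOptionsAppendExecutionProvider_CUDA(options, 0);

      // The runtime partitions the graph across the registered providers.
      // (On Windows the model path parameter is a wide ORTCHAR_T string.)
      Ort::Session session(env, "model.onnx", options);

      // Prepare a 2x2x2x2 float input (shape borrowed from the WebNN example).
      std::vector<int64_t> shape{2, 2, 2, 2};
      std::vector<float> input_data(16, 1.0f);
      Ort::MemoryInfo memory_info =
          Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
      Ort::Value input = Ort::Value::CreateTensor<float>(
          memory_info, input_data.data(), input_data.size(),
          shape.data(), shape.size());

      // Run the graph and read back the first output tensor.
      const char* input_names[] = {"tensor1"};
      const char* output_names[] = {"output"};
      auto outputs = session.Run(Ort::RunOptions{nullptr},
                                 input_names, &input, 1,
                                 output_names, 1);
      float* output_data = outputs[0].GetTensorMutableData<float>();
      (void)output_data;
      return 0;
    }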

  2. DirectML

New in Windows is a hardware abstraction layer designed for ML. This is the DirectX ML system, or DirectML. MSDN here.

Starting in version 1903 (May 2019 Update), DirectML has an operator-level interface. It has a pattern that works well for hardware optimization by breaking work into two stages: (1) initialization and (2) execution.

DirectML also participates underneath the ONNX Runtime. Using WDDM and MCDM, Windows allows IHVs to supply drivers that work with DirectML for accelerating ML workloads. This allows anyone to participate across multiple GPU and TPU/VPU vendors at the operator kernel level.
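
To make the two-stage pattern concrete, here is a minimal, hedged sketch using the public DirectML COM interface (not taken from the comment above). It only shows describing and compiling a single element-wise add operator; creating the ID3D12Device is assumed, and the initializer dispatch plus the binding-table/command-recorder work for execution are summarized in comments rather than shown.

    // Minimal DirectML sketch: describe and compile one element-wise add.
    // `d3d12_device` is assumed to be an already-created ID3D12Device.
    #include <d3d12.h>
    #include <DirectML.h>
    #include <wrl/client.h>
    using Microsoft::WRL::ComPtr;

    void CompileAddOperator(ID3D12Device* d3d12_device) {
      ComPtr<IDMLDevice> dml_device;
      DMLCreateDevice(d3d12_device, DML_CREATE_DEVICE_FLAG_NONE,
                      IID_PPV_ARGS(&dml_device));

      // Describe a 2x2x2x2 float32 buffer tensor (shape from the example graph).
      UINT sizes[4] = {2, 2, 2, 2};
      DML_BUFFER_TENSOR_DESC buffer_desc = {};
      buffer_desc.DataType = DML_TENSOR_DATA_TYPE_FLOAT32;
      buffer_desc.DimensionCount = 4;
      buffer_desc.Sizes = sizes;
      buffer_desc.TotalTensorSizeInBytes = 16 * sizeof(float);
      DML_TENSOR_DESC tensor_desc = {DML_TENSOR_TYPE_BUFFER, &buffer_desc};

      // Describe the operator: output = a + b.
      DML_ELEMENT_WISE_ADD_OPERATOR_DESC add_desc = {};
      add_desc.ATensor = &tensor_desc;
      add_desc.BTensor = &tensor_desc;
      add_desc.OutputTensor = &tensor_desc;
      DML_OPERATOR_DESC op_desc = {DML_OPERATOR_ELEMENT_WISE_ADD, &add_desc};

      // Create and compile the operator. Initialization then happens once by
      // dispatching an IDMLOperatorInitializer, and execution happens per run by
      // dispatching the compiled operator through an IDMLCommandRecorder with
      // inputs/outputs bound via an IDMLBindingTable (omitted here).
      ComPtr<IDMLOperator> op;
      dml_device->CreateOperator(&op_desc, IID_PPV_ARGS(&op));
      ComPtr<IDMLCompiledOperator> compiled_op;
      dml_device->CompileOperator(op.Get(), DML_EXECUTION_FLAG_NONE,
                                  IID_PPV_ARGS(&compiled_op));
    }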

We are also working on future innovation around how those higher-level Graph and IndexedSubGraph interactions will work going forward with the ORT.

@huningxin
Contributor Author

huningxin commented Jun 12, 2019

  • TensorRT

To complete the list, here is an example with the TensorRT API.

The following code builds the graph.

    // Build the graph
    IBuilder* builder = createInferBuilder(logger);
    INetworkDefinition* network = builder->createNetwork();

    Dims4 dims({2, 2, 2, 2});
    const size_t length = 16;
    std::vector<float> buffer(length, 0.5);
    const Weights constant{DataType::kFLOAT, buffer.data(), length};
    ITensor* tensor0 = network->addInput("tensor0", DataType::kFLOAT, dims);
    IConstantLayer* constant1 = network->addConstant(dims, constant);
    ITensor* tensor2 = network->addInput("tensor2", DataType::kFLOAT, dims);
    IConstantLayer* constant3 = network->addConstant(dims, constant);
    IElementWiseLayer* add0 = network->addElementWise(
        *tensor0, *constant1->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* add1 = network->addElementWise(
        *tensor2, *constant3->getOutput(0), ElementWiseOperation::kSUM);
    IElementWiseLayer* mul = network->addElementWise(
        *add0->getOutput(0), *add1->getOutput(0), ElementWiseOperation::kPROD);
    mul->getOutput(0)->setName("output");
    network->markOutput(*mul->getOutput(0));  

The following code executes the graph with input data and gets output data.

    // Execute the graph
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    IExecutionContext* context = engine->createExecutionContext();
    const int input0Index = engine->getBindingIndex("tensor0");
    const int input1Index = engine->getBindingIndex("tensor2");
    const int outputIndex = engine->getBindingIndex("output");
    
    void* buffers[3];
    cudaMalloc(&buffers[input0Index], length * sizeof(float));
    cudaMalloc(&buffers[input1Index], length * sizeof(float));
    cudaMalloc(&buffers[outputIndex], length * sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    std::vector<float> input0Data(length, 1);
    std::vector<float> input1Data(length, 2);
    cudaMemcpyAsync(buffers[input0Index], input0Data.data(),
                    input0Data.size() * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(buffers[input1Index], input1Data.data(),
                    input1Data.size() * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    context->enqueue(1, buffers, stream, nullptr);
    float output[length];
    cudaMemcpyAsync(output, buffers[outputIndex],
                    length * sizeof(float), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    // Print output
    std::cout << "output: [";
    for (unsigned int i = 0; i < length; i++)
    {
        std::cout << output[i] << " ";
    }
    std::cout << "]" << std::endl;

@huningxin
Contributor Author

As #22 has been merged, closing this one.
