Old-style GPGPU reduction, average pixel color

Alec Jacobson

April 28, 2015

weblog/

Here's a little demo which computes the average pixel value of an OpenGL rendering. As a sanity check I compute the average value on the cpu-side using glReadPixels and then compare to computing the average value using a "mip-map" style, ping-pong, texture-buffer GPGPU reduction. Finally I render the buffers for fun.

I'm using yimg just to read in a .png file to render as an example.

On my weak little laptop, the GPU code is about 30 times faster. Can't shrug at that! Rookie mistake. Made timings in debug mode. In release there's hardly a speed up : - (

I was careful to compute the average in a coherent way (the images get progressively blurred out, rather than averaging the image quadrants recursively). This would be useful if, say, computing the average of 100 renderings. You could render them into a 10s x 10s texture and run the GPU reduction until the result is just a 10x10 image containing the 100 results. That'd only require a single final call to glReadPixels (rather than calling glReadPixels to read the final single pixel result of each reduction).

#include <YImage.hpp>
#include <YImage.cpp>
#include <GLUT/glut.h>
#include <iostream>
#include <string>
#include <iomanip>

// Size of image rounded up to next power of 2
size_t s;
// Shader program for doing reduction
GLuint reduction_prog_id;
// Need two textures and buffers to ping-pong
GLuint tex_id[] = {0,0};
GLuint fbo_id[] = {0,0};
GLuint dfbo_id[] = {0,0};
// Image (something to render in the first place)
std::string filename("hockney-512.png");
YImage yimg;

void init_render_to_texture(
  const size_t width,
  const size_t height,
  GLuint & tex_id,
  GLuint & fbo_id,
  GLuint & dfbo_id)
{
  using namespace std;
  // Delete if already exists
  glDeleteTextures(1,&tex_id);
  glDeleteFramebuffersEXT(1,&fbo_id);
  glDeleteFramebuffersEXT(1,&dfbo_id);
  // http://www.opengl.org/wiki/Framebuffer_Object_Examples#Quick_example.2C_render_to_texture_.282D.29
  glGenTextures(1, &tex_id);
  glBindTexture(GL_TEXTURE_2D, tex_id);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  //NULL means reserve texture memory, but texels are undefined
  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F_ARB, width, height, 0, GL_BGRA, GL_FLOAT, NULL);
  glBindTexture(GL_TEXTURE_2D, 0);
  glGenFramebuffersEXT(1, &fbo_id);
  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_id);
  //Attach 2D texture to this FBO
  glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_2D, tex_id, 0);
  glGenRenderbuffersEXT(1, &dfbo_id);
  glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, dfbo_id);
  glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_DEPTH_COMPONENT24, width, height);
  //Attach depth buffer to FBO (for this example it's not really needed, but if
  //drawing a 3D scene it would be necessary to attach something)
  glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_DEPTH_ATTACHMENT_EXT, GL_RENDERBUFFER_EXT, dfbo_id);
  //Does the GPU support current FBO configuration?
  GLenum status;
  status = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
  assert(status == GL_FRAMEBUFFER_COMPLETE_EXT);
  // Unbind to clean up
  glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, 0);
  glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
}

int main(int argc, char * argv[])
{
  glutInit(&argc,argv);
  if(argc>1)
  {
    filename = argv[1];
  }
  yimg.load(filename.c_str());
  s = std::max(yimg.width(),yimg.height());
  // http://stackoverflow.com/a/466278/148668
  s--;
  s |= s >> 1;
  s |= s >> 2;
  s |= s >> 4;
  s |= s >> 8;
  s |= s >> 16;
  s++;

  glutInitDisplayString("rgba depth double stencil");
  glutInitWindowSize(2*s,s);
  glutCreateWindow("texture-reduction");
  glutDisplayFunc(
    []()
    {
      using namespace std;
      // Initialize **both** buffers and set to render into first
      for(int i = 1;i>=0;i--)
      {
        glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_id[i]);
        glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, dfbo_id[i]);
        glClearColor(0,0,0,1);
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
      }

      // Render something
      yimg.flip();
      glMatrixMode(GL_PROJECTION);
      glPushMatrix();
      glLoadIdentity();
      glOrtho(0,s,0,s, -10000,10000);
      glViewport(0,0,s,s);
      glRasterPos2f(0,0);
      glDrawPixels(yimg.width(), yimg.height(), GL_RGBA, GL_UNSIGNED_BYTE, yimg.data());
      glPopMatrix();

      // Even the cpu code should use a buffer (rather than the screen)
      GLfloat * rgb = new GLfloat[yimg.width() * yimg.height() * 3];
      glReadPixels(0, 0, yimg.width(), yimg.height(), GL_RGB, GL_FLOAT, rgb);
      // Gather into double: sequential add is prone to numerical error
      double avg[] = {0,0,0};
      for(size_t i = 0;i<yimg.width()*yimg.height()*3;i+=3)
      {
        avg[0] += rgb[i + 0];
        avg[1] += rgb[i + 1];
        avg[2] += rgb[i + 2];
      }
      for_each(avg,avg+3,[](double & c){c/=yimg.width()*yimg.height();});
      delete[] rgb;

      // Size of square being rendered
      assert(((s != 0) && ((s & (~s + 1)) == s)) && "s should be power of 2");
      size_t h = s/2;
      // odd or even ping-pong iteration
      int odd = 1;
      // Tell shader about texel step size
      glUseProgram(reduction_prog_id);
      glUniform1f(glGetUniformLocation(reduction_prog_id,"dt"),0.5f/float(s));
      glEnable(GL_TEXTURE_2D);
      while(h)
      {
        // Select which texture to draw/compute into
        glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_id[odd]);
        glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, dfbo_id[odd]);
        // Select which texture to draw/compute from
        glBindTexture(GL_TEXTURE_2D,tex_id[1-odd]);
        // Scale to smaller square
        glViewport(0,0,h,h);
        const float f = 2.*(float)h/(float)s;
        // Draw quad filling viewport with shrinking texture coordinates
        glBegin(GL_QUADS);
        glTexCoord2f(0,0);
        glVertex2f  (-1,-1);
        glTexCoord2f(f,0);
        glVertex2f  (1,-1);
        glTexCoord2f(f,f);
        glVertex2f  (1,1);
        glTexCoord2f(0,f);
        glVertex2f  (-1,1);
        glEnd();

        // ping-pong
        odd = 1-odd;
        h = h/2;
      }
      // Read corner pixel of last render
      float px[3];
      glReadPixels(0, 0, 1, 1, GL_RGB, GL_FLOAT, px);
      // Correct for size not power of 2
      for_each(px,px+3,[](float& c){c*=(float)s*s/yimg.width()/yimg.height();});
      cout<<" gpu: "<< px[0]<<" "<< px[1]<<" "<< px[2]<<" "<<endl;
      cout<<" cpu: "<<avg[0]<<" "<<avg[1]<<" "<<avg[2]<<" "<<endl;

      // Purely for vanity, draw the buffers
      glUseProgram(0);
      glBindFramebufferEXT(GL_FRAMEBUFFER_EXT,0);
      glBindRenderbufferEXT(GL_RENDERBUFFER_EXT,0);
      glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
      for(odd = 0;odd<2;odd++)
      {
        glViewport(odd*s,0,s,s);
        glEnable(GL_TEXTURE_2D);
        glBindTexture(GL_TEXTURE_2D,tex_id[odd]);
        glBegin(GL_QUADS);
        glTexCoord2f(0,0);
        glVertex2f  (-1,-1);
        glTexCoord2f(1,0);
        glVertex2f  (1,-1);
        glTexCoord2f(1,1);
        glVertex2f  (1,1);
        glTexCoord2f(0,1);
        glVertex2f  (-1,1);
        glEnd();
      }
      glutSwapBuffers();
    }
   );
  init_render_to_texture(s,s,tex_id[0],fbo_id[0],dfbo_id[0]);
  init_render_to_texture(s,s,tex_id[1],fbo_id[1],dfbo_id[1]);


  // Vertex shader is a **true** pass-through
  const std::string vertex_shader = R"(
#version 120
void main()
{
  gl_Position = gl_Vertex;
  gl_TexCoord[0] = gl_MultiTexCoord0;
}
)";
  // fragment shader sums texture of this pixel and left/bottom neighbors
  const std::string fragment_shader = R"(
#version 120
// size of a half-texel in 1/pixels units: 0.5/(full size)
uniform float dt;
uniform sampler2D texture;
void main()
{
  gl_FragColor = 0.25*(
    texture2D(texture, gl_TexCoord[0].st - vec2( 0, 0)) +
    texture2D(texture, gl_TexCoord[0].st - vec2(dt, 0)) +
    texture2D(texture, gl_TexCoord[0].st - vec2(dt,dt)) +
    texture2D(texture, gl_TexCoord[0].st - vec2( 0,dt)));
}
)";

  // Compile and link reduction shaders into program
  const auto & compile_shader = [](const GLint type, const char * str)->GLuint
  {
    GLuint id = glCreateShader(type);
    glShaderSource(id,1,&str,NULL);
    glCompileShader(id);
    return id;
  };
  GLuint vid = compile_shader(GL_VERTEX_SHADER,vertex_shader.c_str());
  GLuint fid = compile_shader(GL_FRAGMENT_SHADER,fragment_shader.c_str());
  reduction_prog_id = glCreateProgram();
  glAttachShader(reduction_prog_id,vid);
  glAttachShader(reduction_prog_id,fid);
  glLinkProgram(reduction_prog_id);

  glutMainLoop();
}