Uploaded image for project: 'HPCC'
  1. HPCC
  2. HPCC-14788

Investigate (auto) vectorization SIMD

    XMLWordPrintable

    Details

    • Type: Improvement
    • Status: Resolved
    • Priority: Not specified
    • Resolution: Timed Out
    • Affects Version/s: 5.4.6
    • Fix Version/s: None
    • Component/s: Build process
    • Labels:

      Description

      We should take the time to investigate utilizing vectorization.

      This could start with at least investigating produced code and performance gains from using the g++ flag -ftree-vectorize. https://gcc.gnu.org/projects/tree-ssa/vectorization.html#using

      See also http://www.cs.columbia.edu/~orestis/sigmod15.pdf

      For example, COMBINE is effectively used as an element-by-element vector operation, perhaps mostly in the ML library. Vectorizing these activities may yield significant performance boosts. The same is true for PROJECT. At present this is really only applicable to EMBEDDED child datasets.

      Take the following ECL:

      // Child record: a single REAL8 field.
      valueRecord := RECORD
          REAL8 value;
      END;
      
      // Parent record: an id plus an embedded child dataset of valueRecord rows.
      inputRecord := RECORD
          UNSIGNED id;
          EMBEDDED DATASET(valueRecord) values;
      END;
      
      // 1000 child rows; each value is c * 1234 + COUNTER.
      ds1(unsigned c) := DATASET(1000, TRANSFORM(valueRecord, SELF.value := c * 1234 + COUNTER));
      
      // 100 parent rows; id is the row COUNTER and values is ds1 built from that COUNTER.
      ds := DATASET(100, TRANSFORM(inputRecord, SELF.id :=  COUNTER; SELF.values := ds1(COUNTER)));
      
      // Element-wise operation under test: scale each child value by 3.14.
      valueRecord processValue(valueRecord l) := TRANSFORM
          SELF.value := l.value* 3.14;
      END;
      
      // Outer PROJECT copies id and applies processValue to every row of the child dataset.
      // NOFOLD is applied to ds before projecting.
      p := PROJECT(NOFOLD(ds), TRANSFORM(inputRecord, SELF.id := LEFT.id; SELF.values := PROJECT(LEFT.values, processValue(LEFT))));
      
      output(p);
      

      producing the following generated C++ (eclcc -v -fspanmultiplecpp=0 -fsaveecltempfiles demo.ecl):

      /* Template for generating thor/hthor/roxie output */
      #include "eclinclude4.hpp"
      #include "eclrtl.hpp"
      
      
      
      // Runtime type descriptors for the two ECL records.
      // NOTE(review): the first constructor argument looks like a type/flags word and the
      // second a length in bytes -- confirm against the Rtl*TypeInfo declarations.
      const RtlIntTypeInfo ty2(0x101,8);
      // Field "id" described by ty2.
      const RtlFieldStrInfo rf1("id",NULL,&ty2);
      const RtlRealTypeInfo ty5(0x2,8);
      // Field "value" described by ty5.
      const RtlFieldStrInfo rf2("value",NULL,&ty5);
      // Zero-terminated field list of the child record: { value }.
      const RtlFieldInfo * const tl6[] = { &rf2, 0 };
      // Child record type built over tl6.
      const RtlRecordTypeInfo ty4(0xd,8,tl6);
      // Dataset type whose element type is the child record ty4.
      const RtlDatasetTypeInfo ty3(0x414,0,&ty4);
      // Field "values" described by the dataset type ty3.
      const RtlFieldStrInfo rf3("values","values\001Row",&ty3);
      // Zero-terminated field list of the parent record: { id, values }.
      const RtlFieldInfo * const tl7[] = { &rf1,&rf3, 0 };
      // Parent record type built over tl7.
      const RtlRecordTypeInfo ty1(0x40d,4096,tl7);
      // Row deserializer for the parent record.
      struct dmi1 : public COutputRowDeserializer {
      	inline dmi1(unsigned _activityId) : COutputRowDeserializer(_activityId) {}
      	// Reads one row from `in` into `crSelf` and returns the number of bytes stored.
      	virtual size32_t deserialize(ARowBuilder & crSelf, IRowDeserializerSource & in) {
      		crSelf.getSelf();
      		// First 12 bytes are read straight into the start of the row.
      		in.read(12U,crSelf.row() + 0U);
      		unsigned v4;
      		// The unsigned at offset 8 is the byte length of the data that follows.
      		v4 = *((unsigned *)(crSelf.row() + 8U));
      		crSelf.ensureCapacity(v4 + 12U,"values");
      		// The v4 bytes of data are stored immediately after the 12-byte prefix.
      		in.read(v4,crSelf.row() + 12U);
      		return v4 + 12U;
      	}
      };
      // Factory: allocates a dmi1 for the given activity, calls onCreate(ctx) on it and returns it.
      extern ECL_API IOutputRowDeserializer * crdmi1(ICodeContext * ctx, unsigned activityId) {
      	dmi1* p = new dmi1(activityId); 
      	p->onCreate(ctx);
      	return p;
      }
      // Row prefetcher for the parent record: advances `in` past one row without storing it.
      struct pmi1 : public CSourceRowPrefetcher {
      	inline pmi1(unsigned _activityId) : CSourceRowPrefetcher(_activityId) {}
      	virtual void readAhead(IRowDeserializerSource & in) {
      		// Skip 8 bytes, then read a size and skip that many bytes.
      		in.skip(8U);
      		unsigned v5 = in.readSize();
      		in.skip(v5);
      	}
      };
      // Metadata for the child record; its type info is ty4.
      // NOTE(review): the 8 passed to CFixedOutputMetaData is presumably the fixed record size -- confirm.
      struct mi2 : public CFixedOutputMetaData {
      	inline mi2() : CFixedOutputMetaData(8) {}
      	virtual const RtlTypeInfo * queryTypeInfo() const { return &ty4; }
      } mx2;
      // Exported accessor: calls Link() on the global mx2 and returns its address.
      extern "C" ECL_API IOutputMetaData * mf2() { mx2.Link(); return &mx2; }
      // Metadata for the parent record; its type info is ty1.
      struct mi1 : public CVariableOutputMetaData {
      	inline mi1() : CVariableOutputMetaData(12) {}
      	// Returns 4096 when no row is supplied; otherwise the unsigned stored at
      	// offset 8 of the row plus 12.
      	virtual size32_t getRecordSize(const void * data) {
      		if (!data) return 4096;
      		const unsigned char * left = (const unsigned char *)data;
      		unsigned v6 = *((unsigned *)(left + 8U)) + 4U;
      		return v6 + 8U;
      	}
      	virtual const RtlTypeInfo * queryTypeInfo() const { return &ty1; }
      	// Child metadata lookup: index 0 yields mx2, any other index yields NULL.
      	virtual IOutputMetaData * queryChildMeta(unsigned i) {
      		switch (i) {
      			case 0U:
      				return &mx2;
      		}
      		return NULL;
      	}
      	virtual IOutputRowDeserializer * createDiskDeserializer(ICodeContext * ctx, unsigned activityId) {
      		return crdmi1(ctx, activityId);
      	}
      	virtual CSourceRowPrefetcher * doCreateDiskPrefetcher(unsigned activityId) {
      		return new pmi1(activityId);
      	}
      	// NOTE(review): the meaning of the flag value 50 is not visible in this listing.
      	virtual unsigned getMetaFlags() { return 50; }
      } mx1;
      // Exported accessor: calls Link() on the global mx1 and returns its address.
      extern "C" ECL_API IOutputMetaData * mf1() { mx1.Link(); return &mx1; }
      
      // Inline-table activity argument that generates the parent rows (100 rows, see numRows()).
      struct cAc2 : public CThorInlineTableArg {
      	virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
      	// Builds row number `row` (0-based) into crSelf and returns its size in bytes.
      	virtual size32_t getRow(ARowBuilder & crSelf, __uint64 row) {
      		crSelf.getSelf();
      		// id at offset 0 is row+1.
      		*((unsigned long long *)(crSelf.row() + 0U)) = (row+1);
      		// Child rows are first accumulated in a separate dataset builder.
      		RtlFixedDatasetBuilder cr1(8, 0);
      		unsigned long long v2;
      		v2 = 1LLU;
      		// v2 runs from 1 to 1000; each child value is (row+1) * 1234 + v2.
      		for (;v2 <= 1000LL;v2++) {
      			cr1.createRow();
      			*((double *)(cr1.rowBuilder().row() + 0U)) = (double)((long long)(row+1) * 1234LL + (long long)v2);
      			cr1.finalizeRow(8U);
      		}
      		unsigned v3;
      		v3 = cr1.getSize();
      		crSelf.ensureCapacity(v3 + 12U,"values");
      		// Size of the child data goes at offset 8; the data itself is copied to offset 12.
      		*((unsigned *)(crSelf.row() + 8U)) = v3;
      		memcpy(crSelf.row() + 12U,cr1.queryData(),v3);
      		return v3 + 12U;
      	}
      	virtual unsigned long long numRows() {
      		return 100LLU;
      	}
      	virtual unsigned getFlags() { return TTFnoconstant; }
      };
      // Exported factory returning a new cAc2.
      extern "C" ECL_API IHThorArg * fAc2() { return new cAc2; }
      // Project activity argument: copies the id and multiplies every child value by 3.14.
      struct cAc3 : public CThorProjectArg {
      	virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
      	// Transforms the input row `_left` into crSelf and returns the output row size in bytes.
      	virtual size32_t transform(ARowBuilder & crSelf, const void * _left) {
      		crSelf.getSelf();
      		const unsigned char * left = (const unsigned char *) _left;
      		// Copy the 8-byte value at offset 0 unchanged.
      		*((unsigned long long *)(crSelf.row() + 0U)) = *((unsigned long long *)(left + 0U));
      		RtlFixedDatasetBuilder cr7(8, 0);
      		byte * row8;
      		// Input child data starts at offset 12 ...
      		row8 = (byte *)(byte *)(left + 12U);
      		byte * end9;
      		// ... and its byte length is the unsigned stored at offset 8.
      		end9 = row8+*((unsigned *)(left + 8U));
      		// Walk the input in 8-byte steps, emitting one scaled double per step.
      		for (;row8 < end9;) {
      			cr7.createRow();
      			*((double *)(cr7.rowBuilder().row() + 0U)) = *((double *)(row8 + 0U)) * 3.14;
      			cr7.finalizeRow(8U);
      			row8 += 8U;
      		}
      		unsigned vA;
      		vA = cr7.getSize();
      		crSelf.ensureCapacity(vA + 12U,"values");
      		// Size of the child data goes at offset 8; the data itself is copied to offset 12.
      		*((unsigned *)(crSelf.row() + 8U)) = vA;
      		memcpy(crSelf.row() + 12U,cr7.queryData(),vA);
      		return vA + 12U;
      	}
      };
      // Exported factory returning a new cAc3.
      extern "C" ECL_API IHThorArg * fAc3() { return new cAc3; }
      // Workunit-write activity argument for the output; rows use the mx1 metadata.
      struct cAc4 : public CThorWorkUnitWriteArg {
      	virtual int getSequence() { return 0; }
      	virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
      	// Delegates XML serialization of a row to mx1.toXML.
      	virtual void serializeXml(const byte * self, IXmlWriter & out) {
      		mx1.toXML(self, out);
      	}
      };
      // Exported factory returning a new cAc4.
      extern "C" ECL_API IHThorArg * fAc4() { return new cAc4; }
      
      
      // Process entry object for the query.
      struct MyEclProcess : public EclProcess {
      	virtual unsigned getActivityVersion() const { return ACTIVITY_INTERFACE_VERSION; }
      	// Runs workflow item `wfid`: item 1 executes "graph1"; other ids do nothing.
      	// Always returns 1.
      	virtual int perform(IGlobalCodeContext * gctx, unsigned wfid) {
      		ICodeContext * ctx;
      		ctx = gctx->queryCodeContext();
      		switch (wfid) {
      			case 1U: {
      				ctx->executeGraph("graph1",false,0,NULL);
      			}
      			break;
      		}
      		return 1U;
      	}
      };
      // Program entry point: forwards the command line to start_query and returns its result.
      int main(int argc, const char *argv[]) {
      	return start_query(argc, argv);
      
      }
      
      
      // Exported factory returning a new MyEclProcess.
      extern "C" ECL_API IEclProcess* createProcess()
      {
      
          return new MyEclProcess;
      }
      

      This can be compiled (/usr/bin/g++ a.out.cpp -O3 -std=c++11 -fopt-info-vec -fvect-cost-model=unlimited -ftree-vectorize -ftree-vectorizer-verbose=2 -fvisibility=hidden -DUSE_VISIBILITY=1 -fPIC -m64 -c -I/opt/HPCCSystems/componentfiles/cl/include -o a.out-opt.cpp.o
      /usr/bin/g++ -L. -Wl,-E -fPIC -pipe -L/opt/HPCCSystems/lib -Wl,-rpath,/opt/HPCCSystems/lib -L/opt/HPCCSystems/plugins -Wl,-rpath,/opt/HPCCSystems/plugins a.out-opt.cpp.o -leclrtl -lhthor a.out.res.o -o a.out-opt) to give:

      /* Template for generating thor/hthor/roxie output */
      #include "eclinclude4.hpp"
      #include "eclrtl.hpp"
      
      
      
      // Runtime type descriptors for the two ECL records.
      // NOTE(review): the first constructor argument looks like a type/flags word and the
      // second a length in bytes -- confirm against the Rtl*TypeInfo declarations.
      const RtlIntTypeInfo ty2(0x101,8);
      // Field "id" described by ty2.
      const RtlFieldStrInfo rf1("id",NULL,&ty2);
      const RtlRealTypeInfo ty5(0x2,8);
      // Field "value" described by ty5.
      const RtlFieldStrInfo rf2("value",NULL,&ty5);
      // Zero-terminated field list of the child record: { value }.
      const RtlFieldInfo * const tl6[] = { &rf2, 0 };
      // Child record type built over tl6.
      const RtlRecordTypeInfo ty4(0xd,8,tl6);
      // Dataset type whose element type is the child record ty4.
      const RtlDatasetTypeInfo ty3(0x414,0,&ty4);
      // Field "values" described by the dataset type ty3.
      const RtlFieldStrInfo rf3("values","values\001Row",&ty3);
      // Zero-terminated field list of the parent record: { id, values }.
      const RtlFieldInfo * const tl7[] = { &rf1,&rf3, 0 };
      // Parent record type built over tl7.
      const RtlRecordTypeInfo ty1(0x40d,4096,tl7);
      // Row deserializer for the parent record.
      struct dmi1 : public COutputRowDeserializer {
      	inline dmi1(unsigned _activityId) : COutputRowDeserializer(_activityId) {}
      	// Reads one row from `in` into `crSelf` and returns the number of bytes stored.
      	virtual size32_t deserialize(ARowBuilder & crSelf, IRowDeserializerSource & in) {
      		crSelf.getSelf();
      		// First 12 bytes are read straight into the start of the row.
      		in.read(12U,crSelf.row() + 0U);
      		unsigned v4;
      		// The unsigned at offset 8 is the byte length of the data that follows.
      		v4 = *((unsigned *)(crSelf.row() + 8U));
      		crSelf.ensureCapacity(v4 + 12U,"values");
      		// The v4 bytes of data are stored immediately after the 12-byte prefix.
      		in.read(v4,crSelf.row() + 12U);
      		return v4 + 12U;
      	}
      };
      // Factory: allocates a dmi1 for the given activity, calls onCreate(ctx) on it and returns it.
      extern ECL_API IOutputRowDeserializer * crdmi1(ICodeContext * ctx, unsigned activityId) {
      	dmi1* p = new dmi1(activityId); 
      	p->onCreate(ctx);
      	return p;
      }
      // Row prefetcher for the parent record: advances `in` past one row without storing it.
      struct pmi1 : public CSourceRowPrefetcher {
      	inline pmi1(unsigned _activityId) : CSourceRowPrefetcher(_activityId) {}
      	virtual void readAhead(IRowDeserializerSource & in) {
      		// Skip 8 bytes, then read a size and skip that many bytes.
      		in.skip(8U);
      		unsigned v5 = in.readSize();
      		in.skip(v5);
      	}
      };
      // Metadata for the child record; its type info is ty4.
      // NOTE(review): the 8 passed to CFixedOutputMetaData is presumably the fixed record size -- confirm.
      struct mi2 : public CFixedOutputMetaData {
      	inline mi2() : CFixedOutputMetaData(8) {}
      	virtual const RtlTypeInfo * queryTypeInfo() const { return &ty4; }
      } mx2;
      // Exported accessor: calls Link() on the global mx2 and returns its address.
      extern "C" ECL_API IOutputMetaData * mf2() { mx2.Link(); return &mx2; }
      // Metadata for the parent record; its type info is ty1.
      // NOTE(review): this still describes child data starting at offset 12, whereas the
      // hand-modified cAc2 in this listing stores it at offset 16 -- confirm the intended layout.
      struct mi1 : public CVariableOutputMetaData {
      	inline mi1() : CVariableOutputMetaData(12) {}
      	// Returns 4096 when no row is supplied; otherwise the unsigned stored at
      	// offset 8 of the row plus 12.
      	virtual size32_t getRecordSize(const void * data) {
      		if (!data) return 4096;
      		const unsigned char * left = (const unsigned char *)data;
      		unsigned v6 = *((unsigned *)(left + 8U)) + 4U;
      		return v6 + 8U;
      	}
      	virtual const RtlTypeInfo * queryTypeInfo() const { return &ty1; }
      	// Child metadata lookup: index 0 yields mx2, any other index yields NULL.
      	virtual IOutputMetaData * queryChildMeta(unsigned i) {
      		switch (i) {
      			case 0U:
      				return &mx2;
      		}
      		return NULL;
      	}
      	virtual IOutputRowDeserializer * createDiskDeserializer(ICodeContext * ctx, unsigned activityId) {
      		return crdmi1(ctx, activityId);
      	}
      	virtual CSourceRowPrefetcher * doCreateDiskPrefetcher(unsigned activityId) {
      		return new pmi1(activityId);
      	}
      	// NOTE(review): the meaning of the flag value 50 is not visible in this listing.
      	virtual unsigned getMetaFlags() { return 50; }
      } mx1;
      // Exported accessor: calls Link() on the global mx1 and returns its address.
      extern "C" ECL_API IOutputMetaData * mf1() { mx1.Link(); return &mx1; }
      
      // Minimal cursor over a caller-owned byte buffer that hands out consecutive
      // double-sized slots. It performs no bounds checking and does not own the buffer.
      struct rowBuilder {
        // Begin handing out slots at the supplied address.
        rowBuilder(byte * _ptr) : ptr(_ptr) {}
        // Returns the current slot, then advances the cursor by sizeof(double) bytes.
        inline byte * next()
        {
          byte * const slot = ptr;
          ptr += sizeof(double);
          return slot;
        }
        byte * ptr = nullptr;
      };
      
      // Fixed number of child rows assumed by the hand-modified activities; mirrors the
      // 1000 in the ECL ds1 definition.
      const unsigned length = 1000;
      
      // Hand-modified inline-table activity argument that generates the parent rows
      // (100 rows, see numRows()). The child values are written through a plain
      // rowBuilder cursor into a scratch buffer instead of an RtlFixedDatasetBuilder.
      struct cAc2 : public CThorInlineTableArg {
      	virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
      	// Builds row number `row` (0-based) into crSelf and returns its size in bytes.
      	// Layout written here: id at offset 0, child byte length at offset 8, child
      	// data at offset 16 (the generated code used offset 12).
      	// NOTE(review): mx1/dmi1 in this listing still assume child data at offset 12;
      	// confirm the metadata is updated wherever this layout is used.
      	virtual size32_t getRow(ARowBuilder & crSelf, __uint64 row) {
      		crSelf.getSelf();
      		*((unsigned long long *)(crSelf.row() + 0U)) = (row+1);
      		byte * target = new byte [sizeof(double)*length];
      		rowBuilder newRow(target); 
      		// Loop-invariant part of "c * 1234 + COUNTER", hoisted out of the loop.
      		const long long base = (long long)(row+1) * 1234LL;
      		//#pragma GCC ivdep//only needed when target points to 'random' pointer.
      		for (unsigned i = 0; i < length; i++) {
      			// COUNTER is 1-based (the generated loop ran v2 = 1..1000), hence i + 1.
      			// NOTE(review): the 64-bit integer to double conversion may influence
      			// whether GCC vectorizes this loop on a given target -- check -fopt-info-vec.
      			*((double *)newRow.next()) = (double)(base + (long long)(i + 1));
      		}
      		unsigned v3 = sizeof(double)*length;
      		crSelf.ensureCapacity(v3 + 16U,"values");
      		*((unsigned *)(crSelf.row() + 8U)) = v3;
      		memcpy(crSelf.row() + 16U,target,v3);
      		delete [] target;
      		return v3 + 16U; //was +12U
      	}
      	virtual unsigned long long numRows() {
      		return 100LLU;
      	}
      	virtual unsigned getFlags() { return TTFnoconstant; }
      };
      // Exported factory returning a new cAc2.
      extern "C" ECL_API IHThorArg * fAc2() { return new cAc2; }
      // Hand-modified project activity argument: copies the id and multiplies every
      // child value by 3.14, writing results through a plain rowBuilder cursor into a
      // scratch buffer instead of an RtlFixedDatasetBuilder.
      struct cAc3 : public CThorProjectArg {
      	virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
      	// Transforms the input row `_left` into crSelf and returns the output row size in bytes.
      	// Input child data is read from offset 16 (the generated code used 12); the
      	// output row stores its child data at offset 12.
      	virtual size32_t transform(ARowBuilder & crSelf, const void * _left) {
      		crSelf.getSelf();
      		const unsigned char * left = (const unsigned char *) _left;
      		*((unsigned long long *)(crSelf.row() + 0U)) = *((unsigned long long *)(left + 0U));
      		// Take the element count from the byte length stored at offset 8 of the input
      		// row rather than assuming a fixed count, so the loop never reads past the
      		// child data that is actually present.
      		const unsigned count = (unsigned)(*((unsigned *)(left + 8U)) / sizeof(double));
      		byte * target = new byte [sizeof(double)*count];
      		byte * row8;
      		row8 = (byte *)(byte *)(left + 16U);//was +12U
      		rowBuilder newRow(target);
      		for (unsigned i = 0; i < count; i++) {
      			*((double *)(newRow.next())) = *((double *)(row8 + 0U)) * 3.14;
      			row8 += 8U;
      		}
      		unsigned vA = sizeof(double)*count;
      		crSelf.ensureCapacity(vA + 12U,"values");
      		// Size of the child data goes at offset 8; the data itself is copied to offset 12.
      		*((unsigned *)(crSelf.row() + 8U)) = vA;
      		memcpy(crSelf.row() + 12U,target,vA);
      		delete [] target;
      		return vA + 12U;
      	}
      };
      // Exported factory returning a new cAc3.
      extern "C" ECL_API IHThorArg * fAc3() { return new cAc3; }
      // Workunit-write activity argument for the output; rows use the mx1 metadata.
      struct cAc4 : public CThorWorkUnitWriteArg {
      	virtual int getSequence() { return 0; }
      	virtual IOutputMetaData * queryOutputMeta() { return &mx1; }
      	// Delegates XML serialization of a row to mx1.toXML.
      	virtual void serializeXml(const byte * self, IXmlWriter & out) {
      		mx1.toXML(self, out);
      	}
      };
      // Exported factory returning a new cAc4.
      extern "C" ECL_API IHThorArg * fAc4() { return new cAc4; }
      
      
      // Process entry object for the query.
      struct MyEclProcess : public EclProcess {
      	virtual unsigned getActivityVersion() const { return ACTIVITY_INTERFACE_VERSION; }
      	// Runs workflow item `wfid`: item 1 executes "graph1"; other ids do nothing.
      	// Always returns 1.
      	virtual int perform(IGlobalCodeContext * gctx, unsigned wfid) {
      		ICodeContext * ctx;
      		ctx = gctx->queryCodeContext();
      		switch (wfid) {
      			case 1U: {
      				ctx->executeGraph("graph1",false,0,NULL);
      			}
      			break;
      		}
      		return 1U;
      	}
      };
      // Program entry point: forwards the command line to start_query and returns its result.
      int main(int argc, const char *argv[]) {
      	return start_query(argc, argv);
      
      }
      
      
      // Exported factory returning a new MyEclProcess.
      extern "C" ECL_API IEclProcess* createProcess()
      {
      
          return new MyEclProcess;
      }
      

      Attention needs to be given to record alignment to be able to successfully vectorize.

        Attachments

          Issue Links

            Activity

              People

              • Assignee:
                jamienoss Jamie Noss
                Reporter:
                jamienoss Jamie Noss
              • Votes:
                0 Vote for this issue
                Watchers:
                2 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: