Uploaded image for project: 'HPCC'
  1. HPCC
  2. HPCC-12677

CSV dataset read has arbitrary 10M line limit

    XMLWordPrintable

    Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Fixed
    • Affects Version/s: 4.2.8, 5.0.2
    • Fix Version/s: 5.2.0
    • Component/s: Thor
    • Labels:
      None
    • Environment:
      USLM Dev

      Description

      702 builds did not have a 10M line limit on CSV datasets. The following code creates CSV records of configurable length and then attempts to read them back. If the "repBig" attribute (line 9) is set to 100000 (producing an ~10 MB record), everything works. If it is set to 150000 (producing an ~15 MB record), the read fails with the error "File ... contained a line of length greater than 10485760 bytes."

      // Reproduction script for the CSV line-length limit; body is a 100 character string
      body := u'0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789';
      
      rec := { unicode document {maxlength(200000000)}};   // one unicode document per row
      
      workRec := {unsigned id, unsigned order, unsigned repitition, rec};   // rec plus the fields used to assemble each document
      
      repSmall := 150;             // body repetitions for the small documents (ids 1 and 4 below)
      repBig   := 150000;      // 15 MB doc - Fails in OSS - "File ... contained a line of length greater than 10485760 bytes." 
      // repBig   := 100000;   // 10 MB doc - Works
      
      // Skeleton of four documents. Each id has three rows: an opening tag
      // (order 1), an empty placeholder (order 2) carrying the number of body
      // repetitions, and a closing tag (order 1000000). Ids 1 and 4 use
      // repSmall; ids 2 and 3 use repBig.
      structure := dataset([
                            {1, 1      , 1       , u'<doc>'},
                            {1, 2      , repSmall, u''},
                            {1, 1000000, 1       , u'</doc>'},
                            {2, 1      , 1       , u'<doc>'},
                            {2, 2      , repBig  , u''},
                            {2, 1000000, 1       , u'</doc>'},
                            {3, 1      , 1       , u'<doc>'},
                            {3, 2      , repBig  , u''},
                            {3, 1000000, 1       , u'</doc>'},
                            {4, 1      , 1       , u'<doc>'},
                            {4, 2      , repSmall, u''},
                            {4, 1000000, 1       , u'</doc>'}
                           ], workRec); 
      
      // Distribute the skeleton rows on id / 2, then sort each node's rows by
      // id and order so that the local rollups below see every document's rows
      // adjacent and in sequence.
      dist := sort(distribute(structure, id / 2),id,order,local);
      
      // Emit each row left.repitition times. Placeholder rows (order = 2)
      // become copies of body with order set to counter + 1; the tag rows have
      // a repitition of 1 and keep their own document and order.
      expanded := normalize(dist,
                            left.repitition,
                            transform(workRec,
                                      self.document := if(left.order = 2, body, left.document),
                                      self.order    := if(left.order = 2, counter + 1, left.order),
                                      self := left));
      
      // Rollup transform: appends the right row's document to the left row's
      // document; every other field is taken from the left row.
      workRec xfrm(workRec l, workRec r) := transform  
        self.document := l.document + r.document;
        self := l;
      end;
      
      // Speed up concatenation by first merging adjacent rows of the same id
      // whose order falls in the same block of 64 (order div 64).
      // NOTE(review): this comment originally read "putting 64K strings
      // together first"; with 100 character bodies a block of 64 rows is about
      // 6,400 characters - confirm what was intended.
      roll1 := rollup(expanded,
                       left.id = right.id 
                         and left.order div 64 = right.order div 64,
                       xfrm(left,right),
                       local);
      
      // Create intermediate concatenation: merge the roll1 rows of the same id
      // whose order falls in the same block of 2500 (order div 2500).
      roll2 := rollup(roll1,
                       left.id = right.id
                         and left.order div 2500 = right.order div 2500,
                       xfrm(left,right),
                       local);
      
      // Create full concatenation: merge all remaining rows of each id into a
      // single row holding the whole document.
      combined := rollup(roll2,
                         left.id = right.id,
                         xfrm(left,right),
                         local);
      
      
      // Dataset used to read the CSV file back, declared with an empty
      // separator and quote. maxlength(500000000) is specified here, yet per
      // the error quoted at repBig the read still fails on a line longer than
      // 10485760 bytes.
      ds := dataset('~joe::xml_batch_csv', rec, csv(separator(''),quote(''),maxlength(500000000)));
                                   
      // Run in order: write the document field of combined to the CSV file,
      // then read that file back through ds and write it to
      // '~joe::xml_batch_thor'.
      sequential(
         output(combined,{document},'~joe::xml_batch_csv',csv(separator(''), quote('')),overwrite)
        ,output(ds,,'~joe::xml_batch_thor',overwrite)
      );
        
      // The same two outputs as separate actions outside sequential(), left disabled.
      // output(combined,{document},'~joe::xml_batch_csv',csv(separator(''), quote('')),overwrite);  
      // output(ds,,'~joe::xml_batch_thor',overwrite);  
      

        Attachments

          Issue Links

            Activity

              People

              • Assignee:
                jakesmith Jake Smith
                Reporter:
                joecella Joe Cella
              • Votes:
                0 Vote for this issue
                Watchers:
                3 Start watching this issue

                Dates

                • Created:
                  Updated:
                  Resolved: