BatchFlow: Easily batch process collections sequentially or in parallel in JavaScript/Node.js

Why?

I really got tired of writing the following patterns over and over again:

Sequential:

var files = [... list of files ...];
function again(x) {
    if (x < files.length) {
		fs.readFile(files[x], function(err, data) {
			//... do something with data ...
			again(x + 1);
		});
	} else {
		console.log('Done.');
	}
}

again(0);

or..

Parallel:

var files = [... list of files ...];
var pending = 0;
files.forEach(function(file, i) {
	pending += 1;
	fs.readFile(file, function(err, data) {
		//... do something with data ....
		
		pending -= 1;
		if (pending === 0 && i === files.length -1) {
			console.log('Done.');
		}
	});
});

That’s ugly. For more complicated examples it requires a bit more thinking.

Why don’t I use the wonderful library async? Well, `async` tries to do way too much. I also suffer from a severe case of NIH syndrome. Kidding, or else I’d rewrite Express.js. Or am I? Muahahhaa. `async` syntax is also very ugly and not CoffeeScript friendly.

Installation

npm install batchflow

Examples

Arrays

Let’s rewrite the previous sequential example:

Sequential:

var batch = require('batchflow');

var files = [... list of files ...];
// Process the entries one after another.
batch(files).sequential()
// NOTE(review): for arrays the callback arguments look like (index, element, done) — confirm against the batchflow docs
.each(function(i, item, done) {
	fs.readFile(item, function(err, data) {
		//do something with data
		// presumably each value handed to done() is collected into the `results` passed to end() — verify
		done(someResult);
	});
}).end(function(results) {
	//analyze results
});

How about the parallel example?

Parallel:

var batch = require('batchflow');

var files = [... list of files ...];
// Same shape as the sequential example; only the mode selector differs.
batch(files).parallel()
// NOTE(review): for arrays the callback arguments look like (index, element, done) — confirm against the batchflow docs
.each(function(i, item, done) {
	fs.readFile(item, function(err, data) {
		//do something with data
		done(someResult); //<---- yes, you must still call done in parallel, this way we can know when to trigger `end()`.
	});
}).end(function(results) {
	//analyze results
});
```

What’s that, your data is not stored in an array? Oh, you say it’s stored in an object? That’s OK too…

Objects

Sequential:

var batch = require('batchflow');

// The collection is a plain object here instead of an array.
var files = {'file1': 'path'.... 'filen': 'pathn'}
batch(files).sequential()
// The callback parameters are named (key, val, done); val is used as the path to read.
.each(function(key, val, done) {
	fs.readFile(val, function(err, data) {
		//do something with data
		done(someResult);
	});
}).end(function(results) {
	//analyze results
});

How about the parallel example?

Parallel:

var batch = require('batchflow');

// The collection is a plain object here instead of an array.
var files = {'file1': 'path'.... 'filen': 'pathn'}
// Same shape as the sequential object example; only the mode selector differs.
batch(files).parallel()
.each(function(key, val, done) {
	fs.readFile(val, function(err, data) {
		//do something with data
		done(someResult);
	});
}).end(function(results) {
	//analyze results
});

Misc

1. Is `sequential()` or `parallel()` too long? Fine. `series()` and `seq()` are aliases for `sequential()` and `par()` is an alias for `parallel()`.
2. You don’t like the fluent API? That’s OK too:

Non-fluent API BatchFlow

var batch = require('batchflow');
var bf = batch(files);
// Select the mode by setting a property instead of chaining sequential().
bf.isSequential = true;

// Register the per-item callback as a separate statement.
bf.each(function(i, file, done) {
	done(someResult);
});
 
// Register the completion callback as a separate statement.
bf.end(function(results) {
	//blah blah
});

CoffeeScript

batch = require 'batchflow'
files = [... list of files ...]

# Read the files in order; `done` is passed straight to readFile as its
# callback, so it is invoked with readFile's (err, data) arguments.
bf = batch(files).seq().each (i, file, done) ->
  fs.readFile(file, done)

# Log any error reported by the batch.
bf.error (err) -> console.log err

# Print every collected result as a string.
bf.end (results) ->
  for fr in results
    console.log fr.toString()

Error Handling

What’s that, you want error handling? Well, you might as well call me Burger King… have it your way.

// Example 1: parallel run over an object whose only value is a path that is
// expected not to exist.
var a = {'f': '/tmp/file_DOES_NOT_exist_hopefully' + Math.random()};
batch(a).parallel().each(function(i, item, done) {
    // batchflow's done is passed to readFile as its callback, so it is called with the read error
    fs.readFile(item, done);
}).error(function(err) {
    assert(err);
    // NOTE(review): this `done` is not defined in the snippet — presumably the enclosing test's completion callback
    done();
}).end(function() {
    assert(false); //<--- shouldn't get here
});


// Example 2: sequential run whose per-item callback throws synchronously.
var a = ['/tmp/file_DOES_NOT_exist_hopefully' + Math.random()];
batch(a).series().each(function(i, item, done) {
    throw new Error('err');
}).error(function(err) {
    assert(err);
    // NOTE(review): this `done` is not defined in the snippet — presumably the enclosing test's completion callback
    done();
}).end(function() {
    assert(false); //<--- shouldn't get here
});

You can grab the source on Github.

If you use Git with others, you should checkout Gitpilot to make collaboration with Git simple using a different GUI. We would love your feedback.

Follow me on Twitter: @jprichardson

-JP

Like Unix Expect: Automate Command Line Programs in Node.js with Suppose

Have you ever heard of the command line program expect? Basically, expect allows you to automate command line programs. suppose is a programmable Node.js module that allows the same behavior.

Why would you do this? Maybe you want to automate an SSH session? Or maybe you want to test the external interface of one of your Node.js command line scripts.

Install:

npm install suppose

Example:

// Drive `npm init` from inside /tmp/awesome: each on(...) names a prompt to
// wait for and respond(...) supplies the text to type in reply.
process.chdir('/tmp/awesome');
suppose('npm', ['init'])
  .on('name: (awesome) ').respond('awesome_package\n')
  .on('version: (0.0.0) ').respond('0.0.1\n')
  .on('description: ').respond("It's an awesome package man!\n")
  .on('entry point: (index.js) ').respond("\n")
  .on('test command: ').respond('npm test\n')
  .on('git repository: ').respond("\n")
  .on('keywords: ').respond('awesome, cool\n')
  .on('author: ').respond('JP Richardson\n')
  .on('license: (BSD) ').respond('MIT\n')
  .on('ok? (yes) ' ).respond('yes\n')
.end(function(code){
    // The callback receives the exit code; 0 is asserted as success.
    assert(code === 0);
    var packageFile = '/tmp/awesome/package.json';
    fs.readFile(packageFile, function(err, data){
        // Fail with the real read error instead of a TypeError from
        // calling toString() on an undefined `data`.
        assert.ifError(err);

        // Verify that every answer ended up in the generated package.json.
        var packageObj = JSON.parse(data.toString());
        assert(packageObj.name === 'awesome_package');
        assert(packageObj.version === '0.0.1');
        assert(packageObj.description === "It's an awesome package man!");
        assert(packageObj.main === 'index.js');
        assert(packageObj.scripts.test === 'npm test');
        assert(packageObj.keywords[0] === 'awesome');
        assert(packageObj.keywords[1] === 'cool');
        assert(packageObj.author === 'JP Richardson');
        assert(packageObj.license === 'MIT');
        // NOTE(review): `done` is not defined in this snippet — presumably the enclosing test's completion callback
        done();
    });
});

Pretty easy, huh? You can grab the source on Github.

If you use Git with others, you should checkout Gitpilot to make collaboration with Git simple using a different GUI. We would love your feedback.

Follow me on Twitter: @jprichardson

-JP

NextFlow: Sane CoffeeScript Flow Control

Take a look at the most prominent JavaScript control flow libraries: Async.js, Step, Seq. If you were to use these libraries in CoffeeScript, your code would be an ugly mess.

Async.js / CoffeeScript

async = require('async')

# async.series expects its tasks as a single array (or object), not as
# separate arguments, and each task must invoke the callback it is given
# so the next task can start.
async.series([
  ((callback) ->
    #first function
    callback()
  ),
  ((callback) ->
    #second function
    callback()
  )
])

Step / CoffeeScript

Step = require('step')

# Each step is passed as a separate function argument, so every function
# literal has to be wrapped in parentheses and separated by commas.
Step(
  (->
    #first function
  ),
  (->
    #second function
  )
)

Seq / CoffeeScript

Seq = require('seq')

# Steps are chained with .seq(...); each function literal sits inside the
# call's parentheses, with the closing paren on the line that starts the next call.
Seq().seq(->
  #first function
).seq(->
  #second function
)

Yuck. If you’re programming in JavaScript, all of them are very usable solutions. Also, to be fair, they do a lot more than NextFlow. But NextFlow looks much nicer with CoffeeScript programs.

How to Install:

npm install --production nextflow

Can be used in the browser too.

Execute sequentially, calling the `next()` function:

next = require('nextflow')

vals = []
x = 0

# The flow is a plain object; each step hands control on by calling @next().
flow =
  1: ->
    vals.push(1)
    @next()
  2: ->
    vals.push(2)
    x = Math.random()
    # the argument given to @next() arrives as the next step's parameter
    @next(x)
  3: (num) ->
    vals.push(num)
    @next()
  4: ->
    vals.push(4)
    @next()
  # final step: does not call @next()
  5: ->
    console.log vals[0] #is 1
    console.log vals[1] #is 2
    console.log vals[2] #is x
    console.log vals[3] #is 4

next(flow)

Call functions by the label:

vals = []
x = 0

# Same flow as the sequential example, but each step names its successor
# explicitly (@a2(), @a3(x), ...) instead of calling @next().
flow =
  a1: ->
    vals.push(1)
    @a2()
  a2: ->
    vals.push(2)
    x = Math.random()
    # arguments passed to a labelled call arrive as that step's parameters
    @a3(x)
  a3: (num) ->
    vals.push(num)
    @a4()
  a4: ->
    vals.push(4)
    @a5()
  # final step: calls no further step
  a5: ->
    console.log vals[0] #is 1
    console.log vals[1] #is 2
    console.log vals[2] #is x
    console.log vals[3] #is 4

# NOTE(review): `next` is not required in this snippet — presumably still require('nextflow') from the earlier example
next(flow)

Call either `next()` or call the label:

vals = []
x = 0
y = 0

# Mixes both styles: most steps call their successor by label, while a3
# uses @next(y) to move on to the step that follows it (a4).
flow =
  a1: ->
    vals.push(1)
    @a2()
  a2: ->
    vals.push(2)
    x = Math.random()
    @a3(x)
  a3: (num) ->
    vals.push(num)
    y = Math.random()
    # not a label call: @next(y) passes y to a4 as its parameter
    @next(y)
  a4: (num) ->
    vals.push(num)
    @a5()
  # final step: calls no further step
  a5: ->
    console.log vals[0] #is 1
    console.log vals[1] #is 2
    console.log vals[2] #is x
    console.log vals[3] #is y

# NOTE(review): `next` is not required in this snippet — presumably still require('nextflow') from the earlier example
next(flow)

NextFlow on Github

Checkout Gitpilot, to become more productive with Git.

Follow me on Twitter: @jprichardson

-JP

Thinking Asynchronously in CoffeeScript/JavaScript: Loops and Callbacks

Awhile back, I wrote about my new experience in learning Node.js: A Node.js Experiment: Thinking Asynchronously, Using Recursion to Calculate the Total File Size in a Directory.

Consider this snippet of code:

var names = ['JP', 'Chris', 'Leslie'];
for (var i = 0; i < names.length; ++i){
  // Deliberate pitfall: `var` is function-scoped, so all three callbacks
  // share this single `name` variable. The loop finishes before the first
  // timer fires, so each callback alerts the final value.
  var name = names[i];
  setTimeout(function(){
    alert(name);
  },10);
}

Equivalent CoffeeScript:

names = ['JP', 'Chris', 'Leslie']
for name in names
  # Deliberate pitfall: all callbacks share the one `name` variable, and the
  # loop has finished before any timer fires, so each alert shows the last name.
  setTimeout(->
    alert(name)
  ,10)

Click here to run it.

If you guessed that the loop would alert “Leslie” three times, then you’d be correct.

The problem is that the loop has completed before any of the callbacks execute. Thus every callback sees the last value.

How do you solve this problem? You wrap the callback in a closure that executes immediately.

JavaScript:

var names = ['JP', 'Chris', 'Leslie'];
for (var i = 0; i < names.length; ++i){
  var name = names[i];
  // The immediately-invoked function gets its own `name` parameter on every
  // iteration, so each timer callback captures the value that was current
  // when it was scheduled rather than the shared loop variable.
  (function(name){
    setTimeout(function(){
      alert(name);
    },10);
  })(name);
}

CoffeeScript:

names = ['JP', 'Chris', 'Leslie']
for name in names
  # `do (name) ->` invokes the function immediately with the current name,
  # so each timer callback closes over its own copy.
  do (name) ->
    setTimeout(->
      alert(name)
    ,10)

Click here to run it.

These solutions execute the block of code in a parallel manner. The alerts are not a good way to demonstrate this behavior. However, if you were opening files, all of them would be opened at approximately (not exactly) the same time.

What if you wanted to perform the action in the callback in a serial manner?

Using the previous simple example, it’d look like this:

JavaScript:

var names = ['JP', 'Chris', 'Leslie'];

// Alerts names[i] after a short delay and only then schedules the next
// index, so the alerts happen strictly one after another.
// Declared as a function rather than assigned to an undeclared `loop`,
// which would create an implicit global and throw in strict/module code.
function loop(i){
    // Past the end (including an empty list): nothing left to show.
    if (i >= names.length) {
        return;
    }
    setTimeout(function(){
      alert(names[i]);
      loop(i + 1);
    },10);
}
loop(0);

CoffeeScript:

names = ['JP', 'Chris', 'Leslie']

# Alert names[i] after a short delay, then schedule the following index from
# inside the timer callback so the alerts run strictly one after another.
doloop = (i) ->
  showThenContinue = ->
    alert names[i]
    doloop(i + 1) if i < names.length - 1
  setTimeout showThenContinue, 10

doloop 0

Run it.

If you were doing file processing in the loop, it would be executed sequentially.

Hopefully this helps you to better understand asynchronous design of algorithms in JavaScript.

Update:
I forgot about the forEach function that exists in Node.js and most modern browsers. This function pretty much solves the problem.

Here’s the JavaScript code:

var names = ['JP', 'Chris', 'Leslie'];
// forEach calls the function once per element, so each invocation has its
// own `name` parameter for the timer callback to capture.
names.forEach(function(name){
  setTimeout(function(){
    alert(name);
  },10);
});

Much cleaner. Thanks to smog_alado [Reddit] for the reminder.

Checkout Gitpilot, a different kind of Git GUI.

Follow me on Twitter: @jprichardson

-JP

Why Do All the Great Node.js Developers Hate CoffeeScript?

Why do all the great Node.js developers hate CoffeeScript?

Take a look at the following Github repositories of the well-known Node.js developers:

Did you look at them? Not one of them has a project (that isn’t forked) that is written in CoffeeScript. So does the absence of CoffeeScript on Github imply these developers hate it? Absolutely not. Listen to episode 18 or 19 of Nodeup (I don’t remember which one); there are a couple of instances where they (expert Node.js devs) joke and laugh about writing in CoffeeScript. Is this offensive? Of course not. But the attitude is curious to me.

One of the aforementioned developers said the following about a technology:

What if we could omit braces? How about semi-colons?

Sounds like the developer is talking about CoffeeScript, doesn’t it? No, it was TJ Holowaychuk describing Stylus, his CSS replacement language. Look at Stylus, look how CoffeeScript-esque it is. This is the same TJ that doesn’t like CoffeeScript. This is meant to be partially tongue-in-cheek, but it does lend credence to my point.

Can you guess what the second most depended-upon package is on NPM? If you guessed CoffeeScript, you’d be right!

So if it’s the second most depended-upon package, it must be in use by us mere-mortal developers. Having defected from Rails, I love CoffeeScript. But, I ask again, why do the greats have a haughty attitude towards CoffeeScript? This isn’t meant to be a crusade trying to get people to convert to the holier-than-thou CoffeeScript, but a genuine lack of understanding of why the disdain exists. Especially given the acceptance of Haml, SASS, SCSS, Jade, etc. I mean, when it comes down to it, write in whatever makes you happy, but I feel like I’m missing something. If you’re part of the Node.js community, you’ll know what I’m talking about.

Looking over the CoffeeScript page, I think that you can safely conclude that, in general, you’ll write fewer lines of code using CoffeeScript. Code is our enemy, so that’s a good thing.

What do you think about CoffeeScript? Why do you think these developers don’t like CoffeeScript?

More fun CoffeeScript hatred:

If you use Git with others, you should checkout Gitpilot to make collaboration with Git simple. We would love your advice.

If you made it this far, follow me on Twitter: @jprichardson

-JP

Quick and Dirty Screen Scraping with Node.js using Request and Cheerio

I wrote my own screen scraping module built on PhantomJS, but unfortunately it’s too slow for most screen scraping tasks that don’t require browser-side JavaScript. One easy way to scrape pages with Node.js is to use Request and Cheerio.

Here is an example of scraping Bing to get all of the search results:

var request = require('request');
var cheerio = require('cheerio');

// Encode the term so spaces and reserved characters survive in the query string.
var searchTerm = encodeURIComponent('screen scraping');
var url = 'http://www.bing.com/search?q=' + searchTerm;

// Fetch the results page and print the text and href of every matched link.
request(url, function(err, resp, body){
  // Bail out on a failed request instead of trying to parse a missing body.
  if (err) {
    console.error(err);
    return;
  }

  // Declared with `var`: assigning to undeclared `$` / `links` would create
  // implicit globals (and throws in strict mode).
  var $ = cheerio.load(body);
  var links = $('.sb_tlst h3 a'); //use your CSS selector here
  $(links).each(function(i, link){
    console.log($(link).text() + ':\n  ' + $(link).attr('href'));
  });
});

Cheerio acts as a jQuery replacement for a lot of jQuery tasks. It doesn’t replicate jQuery in every way, and most importantly it’s not meant for the browser but for the server. But it beats the pants off of the jsdom/jQuery combo for screen scraping.

Do you use Git? If so, checkout Gitpilot to make collaborating on software development easy.

You should follow me on Twitter: @jprichardson.

-JP

Submitting/Posting Files and Fields to an HTTP Form using C#/.NET

Awhile back, I had to integrate a C# program with a web system that allowed the user to upload a few files and include some misc. data. I Googled around and didn't find a comprehensive solution.

I did use some code I found on the internet, unfortunately I don't remember where, so I can't give proper attribution. If you know, please let me know; it's the code relevant to the MimePart class. I added the form values code and packaged it up into the HttpForm sugar.

Here is the code:

/// <summary>
/// Builds a multipart/form-data request from plain form values and file
/// attachments and submits it to <see cref="Url"/>.
/// </summary>
public class HttpForm {

    // field name -> path of the file to upload
    private Dictionary<string, string> _files = new Dictionary<string, string>();
    // field name -> plain text value
    private Dictionary<string, string> _values = new Dictionary<string, string>();

    /// <summary>Creates a form targeting <paramref name="url"/>; the method defaults to POST.</summary>
    public HttpForm(string url) {
        this.Url = url;
        this.Method = "POST";
    }

    public string Method { get; set; }
    public string Url { get; set; }

    /// <summary>Registers the file at <paramref name="fileName"/> under <paramref name="field"/>, replacing any earlier entry for that field.</summary>
    /// <returns>This instance, so calls can be chained.</returns>
    public HttpForm AttachFile(string field, string fileName) {
        _files[field] = fileName;
        return this;
    }

    /// <summary>Removes every attached file and every form value.</summary>
    /// <returns>This instance, so calls can be chained.</returns>
    public HttpForm ResetForm(){
        _files.Clear();
        _values.Clear();
        return this;
    }

    /// <summary>Sets the plain form value for <paramref name="field"/>, replacing any earlier value.</summary>
    /// <returns>This instance, so calls can be chained.</returns>
    public HttpForm SetValue(string field, string value) {
        _values[field] = value;
        return this;
    }

    /// <summary>Sends the form and returns the server's response.</summary>
    public HttpWebResponse Submit() {
        return this.UploadFiles(_files, _values);
    }


    /// <summary>
    /// Encodes the values and files as multipart/form-data parts, streams
    /// them to the server and returns the response.
    /// </summary>
    /// <remarks>
    /// Any failure is logged to the console and rethrown. Part streams are
    /// always disposed, whether or not the upload succeeds.
    /// </remarks>
    private HttpWebResponse UploadFiles(Dictionary<string, string> files, Dictionary<string, string> otherValues) {
        var req = (HttpWebRequest)WebRequest.Create(this.Url);

        // Timeout is in milliseconds: 10,000,000 ms, i.e. roughly 2.8 hours.
        req.Timeout = 10000 * 1000;
        req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        req.AllowAutoRedirect = false;

        var mimeParts = new List<MimePart>();
        try {
            if (otherValues != null) {
                foreach (var fieldName in otherValues.Keys) {
                    var part = new MimePart();

                    part.Headers["Content-Disposition"] = "form-data; name=\"" + fieldName + "\"";
                    part.Data = new MemoryStream(Encoding.UTF8.GetBytes(otherValues[fieldName]));

                    mimeParts.Add(part);
                }
            }

            if (files != null) {
                foreach (var fieldName in files.Keys) {
                    var part = new MimePart();

                    part.Headers["Content-Disposition"] = "form-data; name=\"" + fieldName + "\"; filename=\"" + files[fieldName] + "\"";
                    part.Headers["Content-Type"] = "application/octet-stream";
                    part.Data = File.OpenRead(files[fieldName]);

                    mimeParts.Add(part);
                }
            }

            string boundary = "----------" + DateTime.Now.Ticks.ToString("x");

            req.ContentType = "multipart/form-data; boundary=" + boundary;
            req.Method = this.Method;

            long contentLength = 0;

            byte[] footer = Encoding.UTF8.GetBytes("--" + boundary + "--\r\n");

            // Each part reports header bytes + data bytes + the trailing CRLF.
            foreach (MimePart part in mimeParts) {
                contentLength += part.GenerateHeaderFooterData(boundary);
            }

            req.ContentLength = contentLength + footer.Length;

            byte[] buffer = new byte[8192];
            byte[] afterFile = Encoding.UTF8.GetBytes("\r\n");
            int read;

            using (Stream s = req.GetRequestStream()) {
                foreach (MimePart part in mimeParts) {
                    s.Write(part.Header, 0, part.Header.Length);

                    while ((read = part.Data.Read(buffer, 0, buffer.Length)) > 0)
                        s.Write(buffer, 0, read);

                    s.Write(afterFile, 0, afterFile.Length);
                }

                s.Write(footer, 0, footer.Length);
            }

            return (HttpWebResponse)req.GetResponse();
        } catch (Exception ex) {
            Console.WriteLine(ex.Message);
            // Rethrow instead of calling GetResponse() a second time: if the
            // failure happened before the body was written (e.g. a missing
            // file), the retry would send a different request than intended
            // and hand its response back as if the upload had succeeded.
            throw;
        } finally {
            // Release file handles and buffers on success and failure alike.
            foreach (MimePart part in mimeParts)
                if (part.Data != null)
                    part.Data.Dispose();
        }
    }

    /// <summary>One multipart section: its headers plus a stream holding the payload.</summary>
    private class MimePart {
        private NameValueCollection _headers = new NameValueCollection();
        public NameValueCollection Headers { get { return _headers; } }

        /// <summary>Encoded boundary line and headers; populated by <see cref="GenerateHeaderFooterData"/>.</summary>
        public byte[] Header { get; protected set; }

        /// <summary>
        /// Builds <see cref="Header"/> for the given boundary and returns the
        /// number of bytes this part contributes to the request body.
        /// </summary>
        public long GenerateHeaderFooterData(string boundary) {
            // Lines are terminated with an explicit CRLF, as multipart
            // requires; AppendLine() would emit the platform newline, which
            // is a bare LF outside Windows.
            const string crlf = "\r\n";
            StringBuilder sb = new StringBuilder();

            sb.Append("--");
            sb.Append(boundary);
            sb.Append(crlf);
            foreach (string key in _headers.AllKeys) {
                sb.Append(key);
                sb.Append(": ");
                sb.Append(_headers[key]);
                sb.Append(crlf);
            }
            // Blank line separating the headers from the payload.
            sb.Append(crlf);

            Header = Encoding.UTF8.GetBytes(sb.ToString());

            // + 2 accounts for the CRLF written after the payload.
            return Header.Length + Data.Length + 2;
        }

        public Stream Data { get; set; }
    }
}

You can easily use it like so:

var file1 = @"C:\file";
var file2 = @"C:\file2";

var yourUrl = "http://yourdomain.com/process.php";
var httpForm = new HttpForm(yourUrl);
// AttachFile and SetValue both return the form, so calls can be chained.
httpForm.AttachFile("file1", file1).AttachFile("file2", file2);
// The method is SetValue (C# is case-sensitive; setValue does not compile).
httpForm.SetValue("foo", "some foo").SetValue("blah", "rarrr!");
httpForm.Submit();

Do you use Git? If so, checkout Gitpilot to make using Git thoughtless.

Follow me on Twitter: @jprichardson.

-JP Richardson

Follow

Get every new post delivered to your Inbox.