split (String)

/*
 *  call-seq:
 *     str.split(pattern=$;, [limit])   => anArray
 *  
 *  Divides <i>str</i> into substrings based on a delimiter, returning an array
 *  of these substrings.
 *     
 *  If <i>pattern</i> is a <code>String</code>, then its contents are used as
 *  the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
 *  space, <i>str</i> is split on whitespace, with leading whitespace and runs
 *  of contiguous whitespace characters ignored.
 *     
 *  If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
 *  pattern matches. Whenever the pattern matches a zero-length string,
 *  <i>str</i> is split into individual characters.
 *     
 *  If <i>pattern</i> is omitted, the value of <code>$;</code> is used.  If
 *  <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
 *  split on whitespace as if ` ' were specified.
 *     
 *  If the <i>limit</i> parameter is omitted, trailing null fields are
 *  suppressed. If <i>limit</i> is a positive number, at most that number of
 *  fields will be returned (if <i>limit</i> is <code>1</code>, the entire
 *  string is returned as the only entry in an array). If negative, there is no
 *  limit to the number of fields returned, and trailing null fields are not
 *  suppressed.
 *     
 *     " now's  the time".split        #=> ["now's", "the", "time"]
 *     " now's  the time".split(' ')   #=> ["now's", "the", "time"]
 *     " now's  the time".split(/ /)   #=> ["", "now's", "", "the", "time"]
 *     "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
 *     "hello".split(//)               #=> ["h", "e", "l", "l", "o"]
 *     "hello".split(//, 3)            #=> ["h", "e", "llo"]
 *     "hi mom".split(%r{\s*})         #=> ["h", "i", "m", "o", "m"]
 *     
 *     "mellow yellow".split("ello")   #=> ["m", "w y", "w"]
 *     "1,2,,3,4,,".split(',')         #=> ["1", "2", "", "3", "4"]
 *     "1,2,,3,4,,".split(',', 4)      #=> ["1", "2", "", "3,4,,"]
 *     "1,2,,3,4,,".split(',', -4)     #=> ["1", "2", "", "3", "4", "", ""]
 */

static VALUE
rb_str_split_m(argc, argv, str)
    int argc;
    VALUE *argv;
    VALUE str;
{
    VALUE spat;
    VALUE limit;
    int awk_split = Qfalse;
    long beg, end, i = 0;
    int lim = 0;
    VALUE result, tmp;

    if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
        lim = NUM2INT(limit);
        if (lim <= 0) limit = Qnil;
        else if (lim == 1) {
            if (RSTRING(str)->len == 0)
                return rb_ary_new2(0);
            return rb_ary_new3(1, str);
        }
        i = 1;
    }

    if (NIL_P(spat)) {
        if (!NIL_P(rb_fs)) {
            spat = rb_fs;
            goto fs_set;
        }
        awk_split = Qtrue;
    }
    else {
      fs_set:
        if (TYPE(spat) == T_STRING && RSTRING(spat)->len == 1) {
            if (RSTRING(spat)->ptr[0] == ' ') {
                awk_split = Qtrue;
            }
            else {
                spat = rb_reg_regcomp(rb_reg_quote(spat));
            }
        }
        else {
            spat = get_pat(spat, 1);
        }
    }

    result = rb_ary_new();
    beg = 0;
    if (awk_split) {
        char *ptr = RSTRING(str)->ptr;
        long len = RSTRING(str)->len;
        char *eptr = ptr + len;
        int skip = 1;

        for (end = beg = 0; ptr<eptr; ptr++) {
            if (skip) {
                if (ISSPACE(*ptr)) {
                    beg++;
                }
                else {
                    end = beg+1;
                    skip = 0;
                    if (!NIL_P(limit) && lim <= i) break;
                }
            }
            else {
                if (ISSPACE(*ptr)) {
                    rb_ary_push(result, rb_str_substr(str, beg, end-beg));
                    skip = 1;
                    beg = end + 1;
                    if (!NIL_P(limit)) ++i;
                }
                else {
                    end++;
                }
            }
        }
    }
    else {
        long start = beg;
        long idx;
        int last_null = 0;
        struct re_registers *regs;

        while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
            regs = RMATCH(rb_backref_get())->regs;
            if (start == end && BEG(0) == END(0)) {
                if (!RSTRING(str)->ptr) {
                    rb_ary_push(result, rb_str_new("", 0));
                    break;
                }
                else if (last_null == 1) {
                    rb_ary_push(result, rb_str_substr(str, beg, mbclen2(RSTRING(str)->ptr[beg],spat)));
                    beg = start;
                }
                else {
                    start += mbclen2(RSTRING(str)->ptr[start],spat);
                    last_null = 1;
                    continue;
                }
            }
            else {
                rb_ary_push(result, rb_str_substr(str, beg, end-beg));
                beg = start = END(0);
            }
            last_null = 0;

            for (idx=1; idx < regs->num_regs; idx++) {
                if (BEG(idx) == -1) continue;
                if (BEG(idx) == END(idx))
                    tmp = rb_str_new5(str, 0, 0);
                else
                    tmp = rb_str_substr(str, BEG(idx), END(idx)-BEG(idx));
                rb_ary_push(result, tmp);
            }
            if (!NIL_P(limit) && lim <= ++i) break;
        }
    }
    if (RSTRING(str)->len > 0 && (!NIL_P(limit) || RSTRING(str)->len > beg || lim < 0)) {
        if (RSTRING(str)->len == beg)
            tmp = rb_str_new5(str, 0, 0);
        else
            tmp = rb_str_substr(str, beg, RSTRING(str)->len-beg);
        rb_ary_push(result, tmp);
    }
    if (NIL_P(limit) && lim == 0) {
        while (RARRAY(result)->len > 0 &&
               RSTRING(RARRAY(result)->ptr[RARRAY(result)->len-1])->len == 0)
            rb_ary_pop(result);
    }

    return result;
}