Programming language shootout: k-nucleotide

From Gambit wiki

This is a Gambit implementation of the k-nucleotide benchmark of the Computer Language Benchmarks Game.

This implementation is quite slow; nearly all the time is spent in all-counts.

The program

;; The Computer Language Benchmarks Game

;; Derived by Bradley Lucier from the Ikarus variant
;; derived by Michael D. Adams from the MzScheme variant

(declare (standard-bindings)(extended-bindings)(block)(fixnum)(not safe))

(define (substring-copy! dest dest-start source source-start source-end)
  (do ((i source-start (+ i 1))
       (j dest-start   (+ j 1)))
      ((= i source-end))
    (string-set! dest j (string-ref source i))))

(define (strings->string strings)
  (let* ((total-length
	  (do ((i 0 (+ i (string-length (car strings))))
	       (strings strings (cdr strings)))
	      ((null? strings) i)))
	  (make-string total-length)))
    (let loop ((strings strings)
	       (i 0))
      (if (null? strings)
	  (let* ((string (car strings))
		 (n (string-length string)))
	    (substring-copy! result i string 0 n)
	    (loop (cdr strings)
		  (+ i n)))))))

(define (insertion-sort lst <)

  (define (insert item lst)
    (if (or (null? lst)
	    (< item (car lst)))
	(cons item lst)
	(cons (car lst)
	      (insert item (cdr lst)))))

  (if (null? lst)
      (insert (car lst) (insertion-sort (cdr lst) <))))


;;;  Boiler-plate for formatting floating point values

(define (roundto digits n)
  (declare (generic))
  (let* ((e (expt 10 digits))
         (num (round (abs (* e (inexact->exact n)))))
         (str (number->string (remainder num e))))
     (if (negative? n) "-" "")
     (number->string (quotient num e))
     (make-string (- digits (string-length str)) #\0)

(define (upcase! str)
  (let ((n (string-length str)))
    (do ((i 0 (+ i 1)))
	((= i n) str)
      (string-set! str i (char-upcase (string-ref str i))))))

(define (all-counts len dna)
  (let ((table (make-table test: string=?
			   size: (let ()
				   (declare (generic))
				   (min (expt 4 len) (string-length dna))))))
    (let loop ((s (- (string-length dna) len))
	       (seq (make-string len)))
      (if (< s 0)
	    (substring-copy! seq 0 dna s (+ s len))
	    (let* ((cnt (table-ref table seq 0)))
	      (table-set! table seq (+ 1 cnt))
	      (loop (- s 1)
		    (if (zero? cnt)
			(make-string len)

(define (write-freqs table)
  (declare (generic))
  (let* ((keys+values (table->list table))
	 (total  (apply + (map cdr keys+values))))
    (for-each (lambda (a)
		(display (car a)) (display " ")
		(display (roundto 3 (* 100 (/ (cdr a) total)))) (newline))
	      (insertion-sort keys+values (lambda (x y) (< (cdr y) (cdr x)))))))

(define (write-one-freq table key)
  (let ((cnt (table-ref table key 0)))
    (display cnt) (display "\t")
    (display key) (display "\n")))

(define dna
  (let ((port (current-input-port))
        (start ">THREE Homo sapiens frequency"))
    (let skip ((x (read-line port)))
      (if (not (string=? x start))
	  (skip (read-line port))
	  (let loop ((x (read-line port))
		     (s '()))
	    (if (eof-object? x)
		(strings->string (reverse s))
		(loop (read-line port)
		      (cons (upcase! x) s))))))))

;; 1-nucleotide counts:
(write-freqs (all-counts 1 dna))

;; 2-nucleotide counts:
(write-freqs (all-counts 2 dna))

;; Specific sequences:
(for-each (lambda (seq)
            (write-one-freq (all-counts (string-length seq) dna)


gsc k-nucleotide


gsi k-nucleotide < knucleotide-input1000000.txt

The file knucleotide-input1000000.txt is generated by fasta.

Note: Running the program with the largest data set creates a string of length 5,000,000. With four-byte characters (Gambit's default) this is 20,000,000 byte object, and the largest object that can be created in a 32-bit system is a bit less than 16MB. So, either configure Gambit with --enable-char-size=1 (or 2) or use a 64-bit version of Gambit.