;; -*- mode: scheme; coding: utf-8 -*-
;; Copyright (c) 2023 
;; SPDX-License-Identifier: MIT
#!r6rs

(library (vterm encoding)
  (export
    make-utf8-decoder
    make-ascii-decoder

    decode-init
    decode-text)
  (import (rnrs))

(define UNICODE-INVALID #xFFFD)

(define-record-type vterm-decoder
  (fields name init-fn decode-fn)
  (protocol
    (lambda (new)
      (lambda (name init-fn decode-fn)
        (new name init-fn decode-fn)))))

(define (decode-init dec)
  (let ([init-fn (vterm-decoder-init-fn dec)])
    (and init-fn (init-fn))))

;; returns (values '(code points) bytes-count)
(define decode-text
  (case-lambda
    [(dec bv)
     ((vterm-decoder-decode-fn dec) bv 0 (bytevector-length bv))]
    [(dec bv i len)
     ((vterm-decoder-decode-fn dec) bv i len)] ))

(define (make-ascii-decoder)
  (make-vterm-decoder
    'ascii
    #f
    (lambda (bv pos len)
      (let loop ([cpl '()] [bc 0] [i pos])
        (if (< i (+ pos len))
            (let* ([byte (bytevector-u8-ref bv i)]
                   [byte (bitwise-xor byte (bitwise-and #x80 byte))])
              (if (or (< byte #x20) (= byte #x7f) (>= byte #x80))
                  (values cpl bc)
                  (loop (append cpl (list byte)) (+ bc 1) (+ i 1) )))
            (let ()
              (values cpl bc)))) )))

(define (make-utf8-decoder)
  (let ([bytes-remaining 0]
        [bytes-total 0]
        [this-cp 0])
  (make-vterm-decoder
    'utf8
    ;; init
    (lambda ()
      (set! bytes-remaining 0)
      (set! bytes-total 0)
      (set! this-cp 0))
    ;; decode
    (lambda (bv pos len)
      (let loop ([cpl '()] [i pos] [end? #f])
        (if (and (not end?) (< i (+ pos len)))
            (call/cc (lambda (return)
            (let* ([byte (bytevector-u8-ref bv i)])
              (cond
                ((fx<? byte #x20)
                 (return cpl (fx- i pos)))

                ((eqv? byte #x7f)
                 (return cpl (fx- i pos)))

                ((and (>= byte #x20) (< byte #x7f))
                 (when (positive? bytes-remaining)
                   (set! cpl (append cpl (list UNICODE-INVALID))))
                 (set! cpl (append cpl (list byte)))
                 (set! bytes-remaining 0))

                ((and (>= byte #x80) (< byte #xc0))
                 (if (eqv? bytes-remaining 0)
                     (loop (append cpl (list UNICODE-INVALID)) (fx+ i 1) #f)
                   (let ()
                     (set! this-cp (bitwise-arithmetic-shift this-cp 6))
                     (set! this-cp (bitwise-ior this-cp (bitwise-and byte #x3f)))
                     (set! bytes-remaining (fx- bytes-remaining 1))
                     (when (eqv? bytes-remaining 0)
                       ;; Check for overlong sequences
                       (case bytes-total
                         ((2)
                          (when (fx<? this-cp #x0080)
                            (set! this-cp UNICODE-INVALID)))
                         ((3)
                          (when (fx<? this-cp #x0800)
                            (set! this-cp UNICODE-INVALID)))
                         ((4)
                          (when (fx<? this-cp #x10000)
                            (set! this-cp UNICODE-INVALID)))
                         ((5)
                          (when (fx<? this-cp #x200000)
                            (set! this-cp UNICODE-INVALID)))
                         ((6)
                          (when (fx<? this-cp #x400000)
                            (set! this-cp UNICODE-INVALID))) )
                       ;; Now look for plain invalid ones
                       (when (or (and (fx>=? this-cp #xD800) (fx<=? this-cp #xDFFF))
                                 (fx=? this-cp #xFFFE)
                                 (fx=? this-cp #xFFFF))
                         (set! this-cp UNICODE-INVALID))
                       (set! cpl (append cpl (list this-cp))))
                   )))

                ((and (fx>=? byte #xc0) (< byte #xe0))
                 (when (positive? bytes-remaining)
                   (set! cpl (append cpl (list UNICODE-INVALID))))
                 (set! this-cp (bitwise-and byte #x1f))
                 (set! bytes-total 2)
                 (set! bytes-remaining 1))

                ((and (fx>=? byte #xe0) (fx<? byte #xf0))
                 (when (positive? bytes-remaining)
                   (set! cpl (append cpl (list UNICODE-INVALID))))
                 (set! this-cp (bitwise-and byte #x0f))
                 (set! bytes-total 3)
                 (set! bytes-remaining 2))

                ((and (fx>=? byte #xf0) (fx<? byte #xf8))
                 (when (positive? bytes-remaining)
                   (set! cpl (append cpl (list UNICODE-INVALID))))
                 (set! this-cp (bitwise-and byte #x07))
                 (set! bytes-total 4)
                 (set! bytes-remaining 3))

                ((and (fx>=? byte #xf8) (fx<? byte #xfc))
                 (when (positive? bytes-remaining)
                   (set! cpl (append cpl (list UNICODE-INVALID))))
                 (set! this-cp (bitwise-and byte #x03))
                 (set! bytes-total 5)
                 (set! bytes-remaining 4))

                ((and (fx>=? byte #xfc) (fx<? byte #xfe))
                 (when (positive? bytes-remaining)
                   (set! cpl (append cpl (list UNICODE-INVALID))))
                 (set! this-cp (bitwise-and byte #x01))
                 (set! bytes-total 6)
                 (set! bytes-remaining 5))

                (else
                  (set! cpl (append cpl (list UNICODE-INVALID))))
              )
            (loop cpl (+ i 1) #f))
          )) ;; call/cc
          (values cpl (fx- i pos)))) ))))

)
