Taylan Ulrich Bayırlı/Kammer
2017-03-11 12:19:44 UTC
See the R6RS Libraries document page 10. The differences:
- R6RS supports reading a BOM.
- R6RS mandates an endianness argument to specify the behavior at the
absence of a BOM.
- R6RS allows an optional third argument 'endianness-mandatory' to
explicitly ignore any possible BOM.
Here's a quick patch on top of master. I didn't test it thoroughly...
===File
/home/taylan/src/guile/guile-master/0001-Fix-R6RS-utf16-string-and-utf32-string.patch===
From f51cd1d4884caafb1ed0072cd77c0e3145f34576 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Taylan=20Ulrich=20Bay=C4=B1rl=C4=B1/Kammer?=
<***@gmail.com>
Date: Fri, 10 Mar 2017 22:36:55 +0100
Subject: [PATCH] Fix R6RS utf16->string and utf32->string.
* module/rnrs/bytevectors.scm (read-bom16, read-bom32): New procedures.
(r6rs-utf16->string, r6rs-utf32->string): Ditto.
---
module/rnrs/bytevectors.scm | 52 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 51 insertions(+), 1 deletion(-)
diff --git a/module/rnrs/bytevectors.scm b/module/rnrs/bytevectors.scm
index 9744359f0..997a8c9cb 100644
--- a/module/rnrs/bytevectors.scm
+++ b/module/rnrs/bytevectors.scm
@@ -69,7 +69,9 @@
bytevector-ieee-double-native-set!
string->utf8 string->utf16 string->utf32
- utf8->string utf16->string utf32->string))
+ utf8->string
+ (r6rs-utf16->string . utf16->string)
+ (r6rs-utf32->string . utf32->string)))
(load-extension (string-append "libguile-" (effective-version))
@@ -80,4 +82,52 @@
`(quote ,sym)
(error "unsupported endianness" sym)))
+(define (read-bom16 bv)
+ (let ((c0 (bytevector-u8-ref bv 0))
+ (c1 (bytevector-u8-ref bv 1)))
+ (cond
+ ((and (= c0 #xFE) (= c1 #xFF))
+ 'big)
+ ((and (= c0 #xFF) (= c1 #xFE))
+ 'little)
+ (else
+ #f))))
+
+(define r6rs-utf16->string
+ (case-lambda
+ ((bv default-endianness)
+ (let ((bom-endianness (read-bom16 bv)))
+ (if (not bom-endianness)
+ (utf16->string bv default-endianness)
+ (substring/shared (utf16->string bv bom-endianness) 1))))
+ ((bv endianness endianness-mandatory?)
+ (if endianness-mandatory?
+ (utf16->string bv endianness)
+ (r6rs-utf16->string bv endianness)))))
+
+(define (read-bom32 bv)
+ (let ((c0 (bytevector-u8-ref bv 0))
+ (c1 (bytevector-u8-ref bv 1))
+ (c2 (bytevector-u8-ref bv 2))
+ (c3 (bytevector-u8-ref bv 3)))
+ (cond
+ ((and (= c0 #x00) (= c1 #x00) (= c2 #xFE) (= c3 #xFF))
+ 'big)
+ ((and (= c0 #xFF) (= c1 #xFE) (= c2 #x00) (= c3 #x00))
+ 'little)
+ (else
+ #f))))
+
+(define r6rs-utf32->string
+ (case-lambda
+ ((bv default-endianness)
+ (let ((bom-endianness (read-bom32 bv)))
+ (if (not bom-endianness)
+ (utf32->string bv default-endianness)
+ (substring/shared (utf32->string bv bom-endianness) 1))))
+ ((bv endianness endianness-mandatory?)
+ (if endianness-mandatory?
+ (utf32->string bv endianness)
+ (r6rs-utf32->string bv endianness)))))
+
;;; bytevector.scm ends here
- R6RS supports reading a BOM.
- R6RS mandates an endianness argument to specify the behavior at the
absence of a BOM.
- R6RS allows an optional third argument 'endianness-mandatory' to
explicitly ignore any possible BOM.
Here's a quick patch on top of master. I didn't test it thoroughly...
===File
/home/taylan/src/guile/guile-master/0001-Fix-R6RS-utf16-string-and-utf32-string.patch===
From f51cd1d4884caafb1ed0072cd77c0e3145f34576 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Taylan=20Ulrich=20Bay=C4=B1rl=C4=B1/Kammer?=
<***@gmail.com>
Date: Fri, 10 Mar 2017 22:36:55 +0100
Subject: [PATCH] Fix R6RS utf16->string and utf32->string.
* module/rnrs/bytevectors.scm (read-bom16, read-bom32): New procedures.
(r6rs-utf16->string, r6rs-utf32->string): Ditto.
---
module/rnrs/bytevectors.scm | 52 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 51 insertions(+), 1 deletion(-)
diff --git a/module/rnrs/bytevectors.scm b/module/rnrs/bytevectors.scm
index 9744359f0..997a8c9cb 100644
--- a/module/rnrs/bytevectors.scm
+++ b/module/rnrs/bytevectors.scm
@@ -69,7 +69,9 @@
bytevector-ieee-double-native-set!
string->utf8 string->utf16 string->utf32
- utf8->string utf16->string utf32->string))
+ utf8->string
+ (r6rs-utf16->string . utf16->string)
+ (r6rs-utf32->string . utf32->string)))
(load-extension (string-append "libguile-" (effective-version))
@@ -80,4 +82,52 @@
`(quote ,sym)
(error "unsupported endianness" sym)))
+(define (read-bom16 bv)
+ (let ((c0 (bytevector-u8-ref bv 0))
+ (c1 (bytevector-u8-ref bv 1)))
+ (cond
+ ((and (= c0 #xFE) (= c1 #xFF))
+ 'big)
+ ((and (= c0 #xFF) (= c1 #xFE))
+ 'little)
+ (else
+ #f))))
+
+(define r6rs-utf16->string
+ (case-lambda
+ ((bv default-endianness)
+ (let ((bom-endianness (read-bom16 bv)))
+ (if (not bom-endianness)
+ (utf16->string bv default-endianness)
+ (substring/shared (utf16->string bv bom-endianness) 1))))
+ ((bv endianness endianness-mandatory?)
+ (if endianness-mandatory?
+ (utf16->string bv endianness)
+ (r6rs-utf16->string bv endianness)))))
+
+(define (read-bom32 bv)
+ (let ((c0 (bytevector-u8-ref bv 0))
+ (c1 (bytevector-u8-ref bv 1))
+ (c2 (bytevector-u8-ref bv 2))
+ (c3 (bytevector-u8-ref bv 3)))
+ (cond
+ ((and (= c0 #x00) (= c1 #x00) (= c2 #xFE) (= c3 #xFF))
+ 'big)
+ ((and (= c0 #xFF) (= c1 #xFE) (= c2 #x00) (= c3 #x00))
+ 'little)
+ (else
+ #f))))
+
+(define r6rs-utf32->string
+ (case-lambda
+ ((bv default-endianness)
+ (let ((bom-endianness (read-bom32 bv)))
+ (if (not bom-endianness)
+ (utf32->string bv default-endianness)
+ (substring/shared (utf32->string bv bom-endianness) 1))))
+ ((bv endianness endianness-mandatory?)
+ (if endianness-mandatory?
+ (utf32->string bv endianness)
+ (r6rs-utf32->string bv endianness)))))
+
;;; bytevector.scm ends here
--
2.11.0
============================================================
2.11.0
============================================================