- It now rejects overlong UTF-8 byte sequences, except for the frequently used two-byte overlong encoding of the null character (0xC0 0x80).
- It now rejects surrogate characters in UTF-8 sequences.
- In some cases, it was returning incorrect values for incomplete UTF-8 sequences (the cPartial argument). This has now been corrected.
- lAllowASCII allows the input string to be all ASCII
- lAllowPartial allows the input string to end with an incomplete UTF-8 sequence
- cPartial (passed by reference) is set to the incomplete UTF-8 sequence at the end of the input string, or the empty string
http://kevincarmody.com/hmg/SOURCE/h_UNICODE_String.prg - lines 175-300
Code: Select all
// Following function modified by Kevin Carmody, October 2015
FUNCTION HMG_IsUTF8( cString, lAllowASCII, lAllowPartial, cPartial )
/*
   Tests whether cString is valid UTF-8.

   Modeled after HB_STRISUTF8 in \src\rtl\strutf8.c in Harbour source and
   is_utf8() posted at http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c

   HB_STRISUTF8 has several bugs:
   1. It does not accept a pure ASCII string.
   2. It does not accept the empty string.
   3. It accepts code points outside of Unicode range.
   4. It accepts overlong UTF-8 sequences.
   5. It accepts surrogate characters.

   Parameters:
     cString       - string to examine; it is scanned one byte at a time.
     lAllowASCII   - optional, default .F.  When .T., cString may consist of
                     ASCII bytes only (this includes the empty string).
                     Otherwise cString must contain at least one non-ASCII
                     byte, or the result is .F.
     lAllowPartial - optional, default .F.  When .T., cString may end with an
                     unfinished UTF-8 byte sequence.  This is useful when
                     cString is a file buffer.
     cPartial      - pass by reference.  Assigned only when lAllowPartial is
                     .T.: it receives the unfinished trailing byte sequence,
                     or the empty string when there is none or when cString
                     is invalid.

   Returns:
     .F. if cString contains any invalid UTF-8 (bad lead or continuation
     byte, overlong sequence, surrogate, code point beyond U+10FFFF), if it
     ends in an unfinished sequence while lAllowPartial is .F., or if it is
     all ASCII while lAllowASCII is .F.; otherwise .T.
     The 2-byte overlong sequence for the null character (0xC0 0x80) is
     deliberately accepted.
*/
   LOCAL lASCII  := .T.   // stays .T. until a byte >= 0x80 is seen
   LOCAL lCheck  := .F.   // .T. while the next continuation byte needs a range check
   LOCAL lUTF8   := .T.   // result; cleared when the scan BREAKs
   LOCAL nCBytes := 0     // continuation bytes still expected in the current sequence
   LOCAL nRBytes := 0     // bytes consumed so far in the current sequence
   LOCAL cChar, nChar, nLead

   IF lAllowASCII == NIL
      lAllowASCII := .F.
   ENDIF
   IF lAllowPartial == NIL
      lAllowPartial := .F.
   ENDIF

   // Any BREAK inside the scan means "invalid" and lands in RECOVER.
   BEGIN SEQUENCE
      FOR EACH cChar IN cString
         nChar := HB_BCODE( cChar )
         IF nCBytes > 0 // check continuation bytes
            IF nChar < 0x80 .OR. nChar > 0xBF // disallow invalid continuation byte
               BREAK
            ENDIF
            IF lCheck // check first continuation byte for partially valid lead byte
               SWITCH nLead
               CASE 0xC0 // disallow 2-byte overlongs except overlong null character
                  IF nChar != 0x80
                     BREAK
                  ENDIF
                  EXIT
               CASE 0xE0 // disallow 3-byte overlongs
                  IF nChar < 0xA0
                     BREAK
                  ENDIF
                  EXIT
               CASE 0xED // disallow surrogates
                  IF nChar > 0x9F
                     BREAK
                  ENDIF
                  EXIT
               CASE 0xF0 // disallow 4-byte overlongs
                  IF nChar < 0x90
                     BREAK
                  ENDIF
                  EXIT
               CASE 0xF4 // disallow 4-byte sequences beyond end of Unicode
                  IF nChar > 0x8F
                     BREAK
                  ENDIF
                  EXIT
               ENDSWITCH
               lCheck := .F. // only the first continuation byte is restricted
            ENDIF
            nCBytes--
            nRBytes++
         ELSEIF nChar >= 0x80 // check lead byte
            lASCII := .F.
            nLead := nChar
            IF nLead < 0xC0 .OR. nLead == 0xC1 .OR. nLead > 0xF4 // disallow invalid lead bytes
               BREAK
            ENDIF
            lCheck := ( nLead == 0xC0 .OR. nLead == 0xE0 .OR. nLead == 0xED .OR. ;
                        nLead == 0xF0 .OR. nLead == 0xF4 ) // partially valid lead bytes
            DO CASE // compute number of continuation bytes
            CASE nLead <= 0xDF
               nCBytes := 1
            CASE nLead <= 0xEF
               nCBytes := 2
            OTHERWISE
               nCBytes := 3
            ENDCASE
            nRBytes := 1 // the lead byte itself
         ENDIF
      NEXT
   RECOVER
      lUTF8 := .F.
   END SEQUENCE

   IF lUTF8 .AND. nCBytes > 0
      // The input ended in the middle of a sequence.
      IF lAllowPartial
         // nRBytes is a BYTE count, so the tail must be cut byte-wise.
         // Right() counts characters under multi-byte codepages such as
         // UTF8EX and could return the wrong span there.
         cPartial := hb_BRight( cString, nRBytes )
      ELSE
         lUTF8 := .F.
      ENDIF
   ELSE
      IF lAllowPartial
         cPartial := ''
      ENDIF
   ENDIF

   IF ! lAllowASCII .AND. lASCII
      lUTF8 := .F.
   ENDIF

RETURN lUTF8