Logo Search packages:      
Sourcecode: inkscape version File versions  Download package

enum CRStatus cr_utils_read_char_from_utf8_buf ( const guchar *  a_in,
gulong  a_in_len,
guint32 *  a_out,
gulong *  a_consumed 
)

Reads a character from an utf8 buffer. Actually decode the next character code (unicode character code) and returns it.

Parameters:
a_in the starting address of the utf8 buffer.
a_in_len the length of the utf8 buffer.
a_out output parameter. The resulting read char.
a_consumed the number of the bytes consumed to decode the returned character code.
Returns:
CR_OK upon successfull completion, an error code otherwise.

Definition at line 428 of file cr-utils.c.

Referenced by cr_input_peek_char(), and cr_input_read_char().

{
        gulong in_len = 0,
                in_index = 0,
                nb_bytes_2_decode = 0;
        enum CRStatus status = CR_OK;

        /*
         *to store the final decoded 
         *unicode char
         */
        guint32 c = 0;

        g_return_val_if_fail (a_in && a_out && a_out
                              && a_consumed, CR_BAD_PARAM_ERROR);

        if (a_in_len < 1) {
                status = CR_OK;
                goto end;
        }

        in_len = a_in_len;

        if (*a_in <= 0x7F) {
                /*
                 *7 bits long char
                 *encoded over 1 byte:
                 * 0xxx xxxx
                 */
                c = *a_in;
                nb_bytes_2_decode = 1;

        } else if ((*a_in & 0xE0) == 0xC0) {
                /*
                 *up to 11 bits long char.
                 *encoded over 2 bytes:
                 *110x xxxx  10xx xxxx
                 */
                c = *a_in & 0x1F;
                nb_bytes_2_decode = 2;

        } else if ((*a_in & 0xF0) == 0xE0) {
                /*
                 *up to 16 bit long char
                 *encoded over 3 bytes:
                 *1110 xxxx  10xx xxxx  10xx xxxx
                 */
                c = *a_in & 0x0F;
                nb_bytes_2_decode = 3;

        } else if ((*a_in & 0xF8) == 0xF0) {
                /*
                 *up to 21 bits long char
                 *encoded over 4 bytes:
                 *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
                 */
                c = *a_in & 0x7;
                nb_bytes_2_decode = 4;

        } else if ((*a_in & 0xFC) == 0xF8) {
                /*
                 *up to 26 bits long char
                 *encoded over 5 bytes.
                 *1111 10xx  10xx xxxx  10xx xxxx  
                 *10xx xxxx  10xx xxxx
                 */
                c = *a_in & 3;
                nb_bytes_2_decode = 5;

        } else if ((*a_in & 0xFE) == 0xFC) {
                /*
                 *up to 31 bits long char
                 *encoded over 6 bytes:
                 *1111 110x  10xx xxxx  10xx xxxx  
                 *10xx xxxx  10xx xxxx  10xx xxxx
                 */
                c = *a_in & 1;
                nb_bytes_2_decode = 6;

        } else {
                /*BAD ENCODING */
                goto end;
        }

        if (nb_bytes_2_decode > a_in_len) {
                status = CR_END_OF_INPUT_ERROR;
                goto end;
        }

        /*
         *Go and decode the remaining byte(s)
         *(if any) to get the current character.
         */
        for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
                /*byte pattern must be: 10xx xxxx */
                if ((a_in[in_index] & 0xC0) != 0x80) {
                        goto end;
                }

                c = (c << 6) | (a_in[in_index] & 0x3F);
        }

        /*
         *The decoded ucs4 char is now
         *in c.
         */

    /************************
     *Some security tests
     ***********************/

        /*be sure c is a char */
        if (c == 0xFFFF || c == 0xFFFE)
                goto end;

        /*be sure c is inferior to the max ucs4 char value */
        if (c > 0x10FFFF)
                goto end;

        /*
         *c must be less than UTF16 "lower surrogate begin"
         *or higher than UTF16 "High surrogate end"
         */
        if (c >= 0xD800 && c <= 0xDFFF)
                goto end;

        /*Avoid characters that equals zero */
        if (c == 0)
                goto end;

        *a_out = c;

      end:
        *a_consumed = nb_bytes_2_decode;

        return status;
}


Generated by  Doxygen 1.6.0   Back to index