convert_utf.h

Go to the documentation of this file.
00001 
00002 /*
00003  * Copyright 2001-2004 Unicode, Inc.
00004  *
00005  * Disclaimer
00006  *
00007  * This source code is provided as is by Unicode, Inc. No claims are
00008  * made as to fitness for any particular purpose. No warranties of any
00009  * kind are expressed or implied. The recipient agrees to determine
00010  * applicability of information provided. If this file has been
00011  * purchased on magnetic or optical media from Unicode, Inc., the
00012  * sole remedy for any claim will be exchange of defective media
00013  * within 90 days of receipt.
00014  *
00015  * Limitations on Rights to Redistribute This Code
00016  *
00017  * Unicode, Inc. hereby grants the right to freely use the information
00018  * supplied in this file in the creation of products supporting the
00019  * Unicode Standard, and to make copies of this file in any form
00020  * for internal or external distribution as long as this notice
00021  * remains attached.
00022  */
00023 
00024 /* ---------------------------------------------------------------------
00025 
00026     Conversions between UTF32, UTF-16, and UTF-8.  Header file.
00027 
00028     Several functions are included here, forming a complete set of
00029     conversions between the three formats.  UTF-7 is not included
00030     here, but is handled in a separate source file.
00031 
00032     Each of these routines takes pointers to input buffers and output
00033     buffers.  The input buffers are const.
00034 
00035     Each routine converts the text between *sourceStart and sourceEnd,
00036     putting the result into the buffer between *targetStart and
00037     targetEnd. Note: the end pointers are *after* the last item: e.g. 
00038     *(sourceEnd - 1) is the last item.
00039 
00040     The return result indicates whether the conversion was successful,
00041     and if not, whether the problem was in the source or target buffers.
00042     (Only the first encountered problem is indicated.)
00043 
00044     After the conversion, *sourceStart and *targetStart are both
00045     updated to point to the end of the last text successfully converted in
00046     the respective buffers.
00047 
00048     Input parameters:
00049     sourceStart - pointer to a pointer to the source buffer.
00050         The contents of this are modified on return so that
00051         it points at the next thing to be converted.
00052     targetStart - similarly, pointer to pointer to the target buffer.
00053     sourceEnd, targetEnd - respectively pointers to the ends of the
00054         two buffers, for overflow checking only.
00055 
00056     These conversion functions take a ConversionFlags argument. When this
00057     flag is set to strict, both irregular sequences and isolated surrogates
00058     will cause an error.  When the flag is set to lenient, both irregular
00059     sequences and isolated surrogates are converted.
00060 
00061     Whether the flag is strict or lenient, all illegal sequences will cause
00062     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
00063     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
00064     must check for illegal sequences.
00065 
00066     When the flag is set to lenient, characters over 0x10FFFF are converted
00067     to the replacement character; otherwise (when the flag is set to strict)
00068     they constitute an error.
00069 
00070     Output parameters:
00071     The value "sourceIllegal" is returned from some routines if the input
00072     sequence is malformed.  When "sourceIllegal" is returned, the source
00073     value will point to the illegal value that caused the problem. E.g.,
00074     in UTF-8 when a sequence is malformed, it points to the start of the
00075     malformed sequence.  
00076 
00077     Author: Mark E. Davis, 1994.
00078     Rev History: Rick McGowan, fixes & updates May 2001.
00079          Fixes & updates, Sept 2001.
00080 
00081 ------------------------------------------------------------------------ */
00082 
00083 /* ---------------------------------------------------------------------
00084     The following definitions are compiler-specific (the fourth, Boolean, is commented out).
00085     The C standard does not guarantee that wchar_t has at least
00086     16 bits, so wchar_t is no less portable than unsigned short!
00087     All should be unsigned values to avoid sign extension during
00088     bit mask & shift operations.
00089 ------------------------------------------------------------------------ */
00090 
00091 #ifndef CONVERT_UTF_H_
00092 #define CONVERT_UTF_H_
00093 
00097 /* Code-unit types for the three encodings; all are unsigned to avoid sign extension in mask and shift operations. */
00098 typedef unsigned long   UTF32;  /* UTF-32 code unit: at least 32 bits (unsigned long may be wider on some platforms) */
00099 typedef unsigned short  UTF16;  /* UTF-16 code unit: at least 16 bits */
00100 typedef unsigned char   UTF8;   /* UTF-8 code unit: typically 8 bits */
00101 /* The Boolean typedef (unsigned char, 0 or 1) is disabled here; IsLegalUTF8Sequence is declared with bool instead. */
00102 /* Some fundamental constants, each written as a UTF32 cast of a hexadecimal literal. */
00103 /* NOTE(review): the expansions have no outer parentheses, so e.g. "sizeof UNI_MAX_BMP" will not parse; parenthesize at the use site. */
00104 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD /* replacement character substituted for out-of-range input in lenient mode */
00105 #define UNI_MAX_BMP (UTF32)0x0000FFFF /* 0xFFFF: presumably the last code point of the Basic Multilingual Plane */
00106 #define UNI_MAX_UTF16 (UTF32)0x0010FFFF /* 0x10FFFF: presumably the largest value representable in UTF-16 */
00107 #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF /* 0x7FFFFFFF: largest value with the top bit of a 32-bit unit clear */
00108 #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF /* values above 0x10FFFF in UTF-32 are treated as illegal */
00109 /* Status returned by each Convert* routine; only the first problem encountered is reported. */
00110 typedef enum {
00111     conversionOK,       /* conversion successful */
00112     sourceExhausted,    /* source ended in the middle of a character (partial character at end of input) */
00113     targetExhausted,    /* insufficient room in the target buffer for the conversion */
00114     sourceIllegal       /* source sequence is illegal/malformed; the source pointer is left at the offending value */
00115 } ConversionResult;
00116 /* Mode selector passed as the last argument of each Convert* routine. */
00117 typedef enum {
00118     strictConversion = 0,   /* irregular sequences and isolated surrogates cause an error */
00119     getConvertedSize,       /* NOTE(review): not described in this header; presumably asks only for the converted size -- confirm against the implementation */
00120     lenientConversion       /* irregular sequences and isolated surrogates are converted rather than rejected */
00121 } ConversionFlags;
00122 
00123 /* This is for C++ and does no harm in C */
00124 #ifdef __cplusplus
00125 extern "C" {
00126 #endif
00127 /* Converts UTF-8 text in [*sourceStart, sourceEnd) to UTF-16 in [*targetStart, targetEnd); both start pointers are advanced past the text successfully converted. */
00128 ConversionResult ConvertUTF8toUTF16 (
00129         const UTF8** sourceStart, const UTF8* sourceEnd, 
00130         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
00131 /* Converts UTF-16 text in [*sourceStart, sourceEnd) to UTF-8 in [*targetStart, targetEnd); the result says whether a failure was in the source or the target buffer. */
00132 ConversionResult ConvertUTF16toUTF8 (
00133         const UTF16** sourceStart, const UTF16* sourceEnd, 
00134         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
00135 /* Converts UTF-8 text in [*sourceStart, sourceEnd) to UTF-32 in [*targetStart, targetEnd); on sourceIllegal, *sourceStart points at the start of the malformed sequence. */
00136 ConversionResult ConvertUTF8toUTF32 (
00137         const UTF8** sourceStart, const UTF8* sourceEnd, 
00138         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
00139 /* Converts UTF-32 text in [*sourceStart, sourceEnd) to UTF-8 in [*targetStart, targetEnd); values above 0x10FFFF are an error in strict mode and become the replacement character in lenient mode. */
00140 ConversionResult ConvertUTF32toUTF8 (
00141         const UTF32** sourceStart, const UTF32* sourceEnd, 
00142         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
00143 /* Converts UTF-16 text in [*sourceStart, sourceEnd) to UTF-32 in [*targetStart, targetEnd); isolated surrogates are an error in strict mode and are converted in lenient mode. */
00144 ConversionResult ConvertUTF16toUTF32 (
00145         const UTF16** sourceStart, const UTF16* sourceEnd, 
00146         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
00147 /* Converts UTF-32 text in [*sourceStart, sourceEnd) to UTF-16 in [*targetStart, targetEnd); the end pointers are one past the last item and are used for overflow checking only. */
00148 ConversionResult ConvertUTF32toUTF16 (
00149         const UTF32** sourceStart, const UTF32* sourceEnd, 
00150         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
00151 /* NOTE(review): presumably reports whether the bytes in [source, sourceEnd) form a legal UTF-8 sequence -- confirm against the implementation. 'bool' is not declared anywhere in this listing, so C callers before C23 would need <stdbool.h> included first. */
00152 bool IsLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
00153 
00154 #ifdef __cplusplus
00155 }
00156 #endif
00157 
00158 #endif // CONVERT_UTF_H_
00159 
00160 /* --------------------------------------------------------------------- */ 

Generated on Tue Jan 6 22:41:12 2009 for Autodesk DWF Whip 2D Toolkit by  doxygen 1.4.5