Skip site navigation (1) Skip section navigation (2)

Peripheral Links

Header And Logo

PostgreSQL
| The world's most advanced open source database.

Site Navigation

Search for
  Advanced Search

patch: support unicode characters above U+10000



This patch adds support for translating UTF-8 representations of Unicode characters at or above U+10000 (i.e. outside the Basic Multilingual Plane) into UTF-16 surrogate pairs. Once the server supports these characters (see recent discussion on -hackers), the driver should be able to process them without problems (in theory..).

This translation behaviour is the same as what (at least) JDK 1.4 does when decoding UTF-8 via a String constructor. To actually handle the resulting surrogate pairs properly throughout the system you need a 1.5 JDK. See http://java.sun.com/developer/technicalArticles/Intl/Supplementary/ for some background.

I also added checks for illegal encodings in the decoder, and added more testcases for the decoder since I've broken it once before..

Along the way I did some microbenchmarking of the decoder against 1.4.2 client and server JVMs. It's still substantially faster to use our own decoder here rather than use the String ctor (factor of 2 difference). The new checks for illegal encodings add about a 10-15% overhead.

-O
Index: org/postgresql/core/Encoding.java
===================================================================
RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/core/Encoding.java,v
retrieving revision 1.16
diff -u -c -r1.16 Encoding.java
*** org/postgresql/core/Encoding.java	17 Jul 2004 07:39:41 -0000	1.16
--- org/postgresql/core/Encoding.java	8 Aug 2004 23:00:50 -0000
***************
*** 261,268 ****
  
   	/**
   	 * Custom byte[] -> String conversion routine for UTF-8 only.
! 	 * This is about 30% faster than using the String(byte[],int,int,String)
! 	 * ctor, at least under JDK 1.4.2.
  	 *
  	 * @param data the array containing UTF8-encoded data
  	 * @param offset the offset of the first byte in <code>data</code> to decode from
--- 261,270 ----
  
   	/**
   	 * Custom byte[] -> String conversion routine for UTF-8 only.
! 	 * This is about twice as fast as using the String(byte[],int,int,String)
! 	 * ctor, at least under JDK 1.4.2. The extra checks for illegal representations
! 	 * add about 10-15% overhead but seem worth it given the number of SQL_ASCII
! 	 * databases out there..
  	 *
  	 * @param data the array containing UTF8-encoded data
  	 * @param offset the offset of the first byte in <code>data</code> to decode from
***************
*** 270,276 ****
  	 * @return a decoded string
  	 * @throws IOException if something goes wrong
   	 */
! 	private synchronized String decodeUTF8(byte[] data, int offset, int length) throws IOException {
  		char[] cdata = decoderArray;
  		if (cdata.length < length)
  			cdata = decoderArray = new char[length];
--- 272,278 ----
  	 * @return a decoded string
  	 * @throws IOException if something goes wrong
   	 */
! 	public synchronized String decodeUTF8(byte[] data, int offset, int length) throws IOException {
  		char[] cdata = decoderArray;
  		if (cdata.length < length)
  			cdata = decoderArray = new char[length];
***************
*** 282,309 ****
  		try {
  			while (in < end) {
  				int ch = data[in++] & 0xff;
  				if (ch < 0x80) {
! 					// Length 1: \u00000 .. \u0007f
  				} else if (ch < 0xe0) { 
! 					// Length 2: \u00080 .. \u007ff
  					ch = ((ch & 0x1f) << 6);
  					ch = ch | (data[in++] & 0x3f);
! 				} else {
! 					// Length 3: \u00800 .. \u0ffff
  					ch = ((ch & 0x0f) << 12);
  					ch = ch | ((data[in++] & 0x3f) << 6);
  					ch = ch | (data[in++] & 0x3f);
  				}
- 				cdata[out++] = (char)ch;
  			}
  		} catch (ArrayIndexOutOfBoundsException a) {
! 			throw new IOException("UTF-8 string representation was truncated");
  		}
! 
  		// Check if we ran past the end without seeing an exception.
  		if (in > end)
! 			throw new IOException("UTF-8 string representation was truncated");
! 
  		return new String(cdata, 0, out);
  	}
  
--- 284,389 ----
  		try {
  			while (in < end) {
  				int ch = data[in++] & 0xff;
+ 
+ 				// Convert UTF-8 to 31-bit codepoint.
  				if (ch < 0x80) {
! 					// 0xxxxxxx -- length 1.
! 				} else if (ch < 0xc0) {
! 					// 10xxxxxx -- illegal!
! 					throw new IOException("Illegal UTF-8 input (initial byte is 10xxxxxx)");
  				} else if (ch < 0xe0) { 
! 					// 110xxxxx 10xxxxxx
  					ch = ((ch & 0x1f) << 6);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 2 of 2 not 10xxxxxx)");
  					ch = ch | (data[in++] & 0x3f);
! 				} else if (ch < 0xf0) {
! 					// 1110xxxx 10xxxxxx 10xxxxxx
  					ch = ((ch & 0x0f) << 12);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 2 of 3 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 6);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 3 of 3 not 10xxxxxx)");
+ 					ch = ch | (data[in++] & 0x3f);
+ 				} else if (ch < 0xf8) {
+ 					// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 					ch = ((ch & 0x07) << 18);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 2 of 4 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 12);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 3 of 4 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 6);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 4 of 4 not 10xxxxxx)");
+ 					ch = ch | (data[in++] & 0x3f);
+ 				} else if (ch < 0xfc) {
+ 					// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 					// nb: should never happen in theory, but might as well accept it anyway --
+ 					// perhaps something is generating non-minimal UTF-8 output.
+ 					ch = ((ch & 0x03) << 24);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 2 of 5 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 18);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 3 of 5 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 12);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 4 of 5 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 6);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 5 of 5 not 10xxxxxx)");
+ 					ch = ch | (data[in++] & 0x3f);
+ 				} else if (ch < 0xfe) {
+ 					// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 					// nb: should never happen in theory, but might as well accept it anyway --
+ 					// perhaps something is generating non-minimal UTF-8 output.
+ 					ch = ((ch & 0x01) << 30);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 2 of 6 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 24);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 3 of 6 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 18);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 4 of 6 not 10xxxxxx)");
+ 					ch = ch | ((data[in++] & 0x3f) << 12);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 5 of 6 not 10xxxxxx)");
  					ch = ch | ((data[in++] & 0x3f) << 6);
+ 					if ((data[in] & 0xc0) != 0x80)
+ 						throw new IOException("Illegal UTF-8 input (byte 6 of 6 not 10xxxxxx)");
  					ch = ch | (data[in++] & 0x3f);
+ 				} else {
+ 					throw new IOException("Illegal UTF-8 input (initial byte is 1111111x)");
+ 				}
+ 
+ 				// Convert 31-bit codepoint to UTF-16
+ 				if (ch > 0x10ffff)
+ 					throw new IOException("Illegal UTF-8 input (final value out of range: " + ch + ")");
+ 				
+ 				if (ch > 0xffff) {
+ 					// Use a surrogate pair to represent it.
+ 					ch -= 0x10000;  // ch is now 0..fffff (20 bits)
+ 					cdata[out++] = (char) (0xd800 + (ch >> 10));   // top 10 bits
+ 					cdata[out++] = (char) (0xdc00 + (ch & 0x3ff)); // bottom 10 bits
+ 				} else if (ch >= 0xd800 && ch < 0xe000) {
+ 					// Not allowed to encode the surrogate range directly.
+ 					throw new IOException("Illegal UTF-8 input (final value is a surrogate value: " + ch + ")");
+ 				} else {
+ 					// Normal case.
+ 					cdata[out++] = (char) ch;
  				}
  			}
  		} catch (ArrayIndexOutOfBoundsException a) {
! 			throw new IOException("UTF-8 input was truncated");
  		}
! 		
  		// Check if we ran past the end without seeing an exception.
  		if (in > end)
! 			throw new IOException("UTF-8 input was truncated");
! 		
  		return new String(cdata, 0, out);
  	}
  
Index: org/postgresql/test/jdbc2/DatabaseEncodingTest.java
===================================================================
RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/test/jdbc2/DatabaseEncodingTest.java,v
retrieving revision 1.2
diff -u -c -r1.2 DatabaseEncodingTest.java
*** org/postgresql/test/jdbc2/DatabaseEncodingTest.java	27 Jul 2004 05:03:04 -0000	1.2
--- org/postgresql/test/jdbc2/DatabaseEncodingTest.java	8 Aug 2004 23:00:50 -0000
***************
*** 1,16 ****
  package org.postgresql.test.jdbc2;
  
  import org.postgresql.test.TestUtil;
  import junit.framework.TestCase;
  import java.sql.*;
  
  /*
!  * Test case for Dario's encoding problems.
!  * Ensure the driver's own utf-8 decode method works.
   */
  public class DatabaseEncodingTest extends TestCase
  {
  	private Connection con;
  
  	public DatabaseEncodingTest(String name)
  	{
--- 1,23 ----
  package org.postgresql.test.jdbc2;
  
  import org.postgresql.test.TestUtil;
+ import org.postgresql.core.Encoding;
+ import java.io.IOException;
+ import java.util.Arrays;
  import junit.framework.TestCase;
  import java.sql.*;
  
  /*
!  * Test case for various encoding problems.
!  *
!  * Ensure that we can do a round-trip of all server-supported unicode
!  * values without trashing them, and that bad character encodings are
!  * detected.
   */
  public class DatabaseEncodingTest extends TestCase
  {
  	private Connection con;
+ 	private final Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8");
  
  	public DatabaseEncodingTest(String name)
  	{
***************
*** 66,73 ****
  		rs.close();
  
  		// Create data.
! 		// NB: we only test up to d800 as code points above that are
! 		// reserved for surrogates in UTF-16
  		PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)");		
  		for (int i = 1; i < 0xd800; i += STEP) {
  			int count = (i+STEP) > 0xd800 ? 0xd800-i : STEP;
--- 73,81 ----
  		rs.close();
  
  		// Create data.
! 		// NB: we avoid d800-dfff as that range is reserved for surrogates in UTF-16.
! 		// We also do not test codepoints above U+10000 as the server doesn't correctly
! 		// support them (yet).
  		PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)");		
  		for (int i = 1; i < 0xd800; i += STEP) {
  			int count = (i+STEP) > 0xd800 ? 0xd800-i : STEP;
***************
*** 82,87 ****
--- 90,108 ----
  			assertEquals(1, insert.executeUpdate());
  		}
  
+ 		for (int i = 0xe000; i < 0x10000; i += STEP) {
+ 			int count = (i+STEP) > 0x10000 ? 0x10000-i : STEP;
+ 			char[] testChars = new char[count];
+ 			for (int j = 0; j < count; ++j)
+ 				testChars[j] = (char)(i+j);
+ 			
+ 			String testString = new String(testChars);
+ 			
+ 			insert.setInt(1, i);
+ 			insert.setString(2, testString);
+ 			assertEquals(1, insert.executeUpdate());
+ 		}
+ 
  		con.commit();
  			
  		// Check data.
***************
*** 99,103 ****
--- 120,267 ----
  			
  			assertEquals(dumpString(testString), dumpString(rs.getString(2)));
  		}
+ 
+ 		for (int i = 0xe000; i < 0x10000; i += STEP) {
+ 			assertTrue(rs.next());
+ 			assertEquals(i, rs.getInt(1));
+ 			
+ 			int count = (i+STEP) > 0x10000 ? 0x10000-i : STEP;
+ 			char[] testChars = new char[count];
+ 			for (int j = 0; j < count; ++j)
+ 				testChars[j] = (char)(i+j);
+ 			
+ 			String testString = new String(testChars);
+ 			
+ 			assertEquals(dumpString(testString), dumpString(rs.getString(2)));
+ 		}
+ 	}
+ 
+ 	public void testUTF8Decode() throws Exception {
+ 		// Tests for our custom UTF-8 decoder.
+ 
+ 		for (int ch = 0; ch < 0x110000; ++ch) {
+ 			if (ch >= 0xd800 && ch < 0xe000)
+ 				continue; // Surrogate range.
+ 
+ 			String testString;
+ 			if (ch >= 0x10000) {
+ 				testString = new String(new char[] {
+ 											(char) (0xd800 + ((ch-0x10000) >> 10)),
+ 											(char) (0xdc00 + ((ch-0x10000) & 0x3ff)) });
+ 			} else {
+ 				testString = new String(new char[] { (char)ch });
+ 			}
+ 
+ 			byte[] jvmEncoding = testString.getBytes("UTF-8");
+ 			String jvmDecoding = new String(jvmEncoding, 0, jvmEncoding.length, "UTF-8");
+ 			String ourDecoding = utf8Encoding.decode(jvmEncoding, 0, jvmEncoding.length);
+ 
+ 			assertEquals(testString, jvmDecoding);
+ 			assertEquals(testString, ourDecoding);
+ 		}
+ 	}
+ 
+ 	public void testBadUTF8Decode() throws Exception {
+ 		byte[][] badSequences = new byte[][] {
+ 			// One-byte illegal sequences
+ 			{ (byte)0x80 }, // First byte may not be 10xxxxxx
+ 
+ 			// Two-byte illegal sequences
+ 			{ (byte)0xc0, (byte)0x00 },  // Second byte must be 10xxxxxx
+ 
+ 			// Three-byte illegal sequences
+ 			{ (byte)0xe0, (byte)0x00 },  // Second byte must be 10xxxxxx
+ 			{ (byte)0xe0, (byte)0x80, (byte)0x00 },  // Third byte must be 10xxxxxx
+ 			{ (byte)0xed, (byte)0xa0, (byte)0x80 },  // Not allowed to encode the range d800..dfff
+ 
+ 			// Four-byte illegal sequences
+ 			{ (byte)0xf0, (byte)0x00 },  // Second byte must be 10xxxxxx
+ 			{ (byte)0xf0, (byte)0x80, (byte)0x00 },  // Third byte must be 10xxxxxx
+ 			{ (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x00 },  // Fourth byte must be 10xxxxxx
+ 
+ 			// Five-byte illegal sequences
+ 			{ (byte)0xf8, (byte)0x00 },  // Second byte must be 10xxxxxx
+ 			{ (byte)0xf8, (byte)0x80, (byte)0x00 },  // Third byte must be 10xxxxxx
+ 			{ (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x00 },  // Fourth byte must be 10xxxxxx
+ 			{ (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 },  // Fifth byte must be 10xxxxxx
+ 			{ (byte)0xf8, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80 },  // Resulting value must be < U+110000
+ 
+ 			// Six-byte illegal sequences
+ 			{ (byte)0xfc, (byte)0x00 },  // Second byte must be 10xxxxxx
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x00 },  // Third byte must be 10xxxxxx
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x00 },  // Fourth byte must be 10xxxxxx
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 },  // Fifth byte must be 10xxxxxx
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 },  // Sixth byte must be 10xxxxxx
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80 },  // Resulting value must be < U+110000
+ 			
+ 			// Seven-byte illegal sequences
+ 			{ (byte)0xfe }, // Can't have a seven-byte sequence.
+ 
+ 			// Eight-byte illegal sequences
+ 			{ (byte)0xff }, // Can't have an eight-byte sequence.
+ 		};
+ 
+ 		byte[] paddedSequence = new byte[32];
+ 		for (int i = 0; i < badSequences.length; ++i) {
+ 			byte[] sequence = badSequences[i];
+ 			
+ 			try {
+ 				String str = utf8Encoding.decode(sequence, 0, sequence.length);
+ 				fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ 			} catch (IOException ioe) {}
+ 			
+ 			// Try it with padding.
+ 			Arrays.fill(paddedSequence, (byte)0);
+ 			System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length);
+ 
+ 			try {
+ 				String str = utf8Encoding.decode(paddedSequence, 0, paddedSequence.length);
+ 				fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ 			} catch (IOException ioe) {}
+ 		}
+ 	}
+ 
+ 	public void testTruncatedUTF8Decode() throws Exception {
+ 		byte[][] shortSequences = new byte[][] {
+ 			{ (byte)0xc0 },              // Second byte must be present
+ 
+ 			{ (byte)0xe0 },              // Second byte must be present
+ 			{ (byte)0xe0, (byte)0x80 },  // Third byte must be present
+ 
+ 			{ (byte)0xf0 },              // Second byte must be present
+ 			{ (byte)0xf0, (byte)0x80 },  // Third byte must be present
+ 			{ (byte)0xf0, (byte)0x80, (byte)0x80 },  // Fourth byte must be present
+ 
+ 			{ (byte)0xfc },              // Second byte must be present
+ 			{ (byte)0xfc, (byte)0x80 },  // Third byte must be present
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x80 },  // Fourth byte must be present
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80 },  // Fifth byte must be present
+ 			{ (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },  // Sixth byte must be present
+ 
+ 			{ (byte)0xf8 },              // Second byte must be present
+ 			{ (byte)0xf8, (byte)0x80 },  // Third byte must be present
+ 			{ (byte)0xf8, (byte)0x80, (byte)0x80 },  // Fourth byte must be present
+ 			{ (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x80 },  // Fifth byte must be present
+ 		};
+ 
+ 		byte[] paddedSequence = new byte[32];
+ 		for (int i = 0; i < shortSequences.length; ++i) {
+ 			byte[] sequence = shortSequences[i];
+ 			
+ 			try {
+ 				String str = utf8Encoding.decode(sequence, 0, sequence.length);
+ 				fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ 			} catch (IOException ioe) {}
+ 
+ 			
+ 			// Try it with padding and a truncated length.
+ 			Arrays.fill(paddedSequence, (byte)0);
+ 			System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length);
+ 			
+ 			try {
+ 				String str = utf8Encoding.decode(paddedSequence, 0, sequence.length);
+ 				fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ 			} catch (IOException ioe) {}
+ 		}
  	}
  }


Home | Main Index | Thread Index

Privacy Policy | PostgreSQL Archives hosted by Command Prompt, Inc. | Designed by tinysofa
Copyright © 1996 – 2008 PostgreSQL Global Development Group