Delphi: Punicode Decode
Нашел информацию, закодированную в Punycode: кириллическое доменное имя, но как мне декодировать обратно (punycode на кириллицу)?
3 ответа
Код, на который вы ссылаетесь, не работает. PunycodeDecode
функция нарушена. Строка 416 гласит:
move(output[i], output[i + 1], (outidx - i) * SizeOf(output^));
Это неверный перевод с C. Вместо этого он должен читать:
move(output[i], output[i + 1], (outidx - i) * SizeOf(output^[0]));
После внесения этого изменения я успешно протестировал следующую программу:
program Punycode;
{$APPTYPE CONSOLE}
uses
SysUtils,
dorPunyCode in 'dorPunyCode.pas';
function PEncode(const str: UnicodeString): AnsiString;
var
len: Cardinal;
begin
if str = '' then
begin
Result := '';
exit;
end;
if (PunycodeEncode(Length(str), PPunyCode(str), len) = pcSuccess) and (Length(str) + 1 <> len) then
begin
SetLength(Result, len);
PunycodeEncode(Length(str), PPunyCode(str), len, PByte(Result));
end else
Result := str;
end;
function PDecode(const str: AnsiString): UnicodeString;
var
outputlen: Cardinal;
begin
if str = '' then
begin
Result := '';
exit;
end;
outputlen := 0;
if (PunycodeDecode(Length(str), PByte(str), outputlen) = pcSuccess) and (Length(str) + 1 <> outputlen) then
begin
SetLength(Result, outputlen);
PunycodeDecode(Length(str), PByte(str), outputlen, PPunycode(Result));
end else
Result := str;
end;
procedure Test(const Input: UnicodeString);
begin
if PDecode(PEncode(Input))<>Input then
raise EAssertionFailed.CreateFmt('Round-trip failed: %s', [Input]);
end;
begin
Test('http://президент.рф/');
Test('David Heffernan');
Test('');
Test('A');
end.
Тем не менее, я ничего не знаю о Punycode и, в частности, я понятия не имею, какой смысл 'xn--'
префикс, который добавляется в вопросе, с которым вы связаны. Итак PEncode
а также PDecode
процедуры, которые я показываю выше, вероятно, не совсем то, что вам нужно.
Я чрезвычайно сомневаюсь в коде Delphi в dorPunyCode
Я подозреваю, что есть другие проблемы, скрывающиеся. На твоем месте я бы заполучил punycode.c
, скомпилируйте это с bcc32
а затем ссылку на него с $L
, Я бы посчитал это гораздо более надежным, чем это dorPunyCode
Блок.
В этом коде есть еще несколько ошибок.
Вот тест для него:
program PunyCodeTest;
uses
Vcl.Dialogs,
SysUtils,
PunyCode in '..\SRC\PunyCode.pas';
type
TCodecTestRec = record
Decoded: AnsiString;
Encoded: AnsiString;
end;
TDomainTestRec = record
Decoded: WideString;
Encoded: AnsiString;
end;
const
CodecTestCases: array [0..19] of TCodecTestRec = (
// My samples
// ----------
// 蒙古火锅-test
(Decoded: 'u+8499 u+53E4 u+706B u+9505 u+002D u+0074 u+0065 u+0073 u+0074';
Encoded: '-test-xt8h571o0z7ad54a'),
// RFC 3492 - 7.1 Sample strings
// -----------------------------
// (A) Arabic (Egyptian):
(Decoded: 'u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644 '+
'u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F';
Encoded: 'egbpdaj6bu4bxfgehfvwxn'),
//(B) Chinese (simplified):
(Decoded: 'u+4ED6 u+4EEC u+4E3A u+4EC0 u+4E48 u+4E0D u+8BF4 u+4E2D u+6587';
Encoded: 'ihqwcrb4cv8a8dqg056pqjye'),
// (C) Chinese (traditional):
(Decoded: 'u+4ED6 u+5011 u+7232 u+4EC0 u+9EBD u+4E0D u+8AAA u+4E2D u+6587';
Encoded: 'ihqwctvzc91f659drss3x8bo0yb'),
// (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
(Decoded: 'U+0050 u+0072 u+006F u+010D u+0070 u+0072 u+006F u+0073 u+0074 '+
'u+011B u+006E u+0065 u+006D u+006C u+0075 u+0076 u+00ED u+010D u+0065 '+
'u+0073 u+006B u+0079';
Encoded: 'Proprostnemluvesky-uyb24dma41a'),
// (E) Hebrew:
(Decoded: 'u+05DC u+05DE u+05D4 u+05D4 u+05DD u+05E4 u+05E9 u+05D5 u+05D8 '+
'u+05DC u+05D0 u+05DE u+05D3 u+05D1 u+05E8 u+05D9 u+05DD u+05E2 u+05D1 '+
'u+05E8 u+05D9 u+05EA';
Encoded: '4dbcagdahymbxekheh6e0a7fei0b'),
// (F) Hindi (Devanagari):
(Decoded: 'u+092F u+0939 u+0932 u+094B u+0917 u+0939 u+093F u+0928 u+094D '+
'u+0926 u+0940 u+0915 u+094D u+092F u+094B u+0902 u+0928 u+0939 u+0940 '+
'u+0902 u+092C u+094B u+0932 u+0938 u+0915 u+0924 u+0947 u+0939 u+0948 '+
'u+0902';
Encoded: 'i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd'),
// (G) Japanese (kanji and hiragana):
(Decoded: 'u+306A u+305C u+307F u+3093 u+306A u+65E5 u+672C u+8A9E u+3092 '+
'u+8A71 u+3057 u+3066 u+304F u+308C u+306A u+3044 u+306E u+304B';
Encoded: 'n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa'),
// (H) Korean (Hangul syllables):
(Decoded: 'u+C138 u+ACC4 u+C758 u+BAA8 u+B4E0 u+C0AC u+B78C u+B4E4 u+C774 '+
'u+D55C u+AD6D u+C5B4 u+B97C u+C774 u+D574 u+D55C u+B2E4 u+BA74 u+C5BC '+
'u+B9C8 u+B098 u+C88B u+C744 u+AE4C';
Encoded: '989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c'),
// (I) Russian (Cyrillic):
(Decoded: 'U+043F u+043E u+0447 u+0435 u+043C u+0443 u+0436 u+0435 u+043E '+
'u+043D u+0438 u+043D u+0435 u+0433 u+043E u+0432 u+043E u+0440 u+044F '+
'u+0442 u+043F u+043E u+0440 u+0443 u+0441 u+0441 u+043A u+0438';
Encoded: 'b1abfaaepdrnnbgefbaDotcwatmq2g4l'),
// (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
(Decoded: 'U+0050 u+006F u+0072 u+0071 u+0075 u+00E9 u+006E u+006F u+0070 '+
'u+0075 u+0065 u+0064 u+0065 u+006E u+0073 u+0069 u+006D u+0070 u+006C '+
'u+0065 u+006D u+0065 u+006E u+0074 u+0065 u+0068 u+0061 u+0062 u+006C '+
'u+0061 u+0072 u+0065 u+006E U+0045 u+0073 u+0070 u+0061 u+00F1 u+006F '+
'u+006C';
Encoded: 'PorqunopuedensimplementehablarenEspaol-fmd56a'),
// (K) Vietnamese:
// T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
// <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
(Decoded: 'U+0054 u+1EA1 u+0069 u+0073 u+0061 u+006F u+0068 u+1ECD u+006B '+
'u+0068 u+00F4 u+006E u+0067 u+0074 u+0068 u+1EC3 u+0063 u+0068 u+1EC9 '+
'u+006E u+00F3 u+0069 u+0074 u+0069 u+1EBF u+006E u+0067 U+0056 u+0069 '+
'u+1EC7 u+0074';
Encoded: 'TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g'),
// (L) 3<nen>B<gumi><kinpachi><sensei>
(Decoded: 'u+0033 u+5E74 U+0042 u+7D44 u+91D1 u+516B u+5148 u+751F';
Encoded: '3B-ww4c5e180e575a65lsy2b'),
// (M) <amuro><namie>-with-SUPER-MONKEYS
(Decoded: 'u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074 '+
'u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D U+004F '+
'U+004E U+004B U+0045 U+0059 U+0053';
Encoded: '-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n'),
// (N) Hello-Another-Way-<sorezore><no><basho>
(Decoded: 'U+0048 u+0065 u+006C u+006C u+006F u+002D U+0041 u+006E u+006F '+
'u+0074 u+0068 u+0065 u+0072 u+002D U+0057 u+0061 u+0079 u+002D u+305D '+
'u+308C u+305E u+308C u+306E u+5834 u+6240';
Encoded: 'Hello-Another-Way--fc4qua05auwb3674vfr0b'),
// (O) <hitotsu><yane><no><shita>2
(Decoded: 'u+3072 u+3068 u+3064 u+5C4B u+6839 u+306E u+4E0B u+0032';
Encoded: '2-u9tlzr9756bt3uc0v'),
// (P) Maji<de>Koi<suru>5<byou><mae>
(Decoded: 'U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059 '+
'u+308B u+0035 u+79D2 u+524D';
Encoded: 'MajiKoi5-783gue6qz075azm5e'),
// (Q) <pafii>de<runba>
(Decoded: 'u+30D1 u+30D5 u+30A3 u+30FC u+0064 u+0065 u+30EB u+30F3 u+30D0';
Encoded: 'de-jg4avhby1noc0d'),
// (R) <sono><supiido><de>
(Decoded: 'u+305D u+306E u+30B9 u+30D4 u+30FC u+30C9 u+3067';
Encoded: 'd9juau41awczczp'),
// (S) -> $1.00 <-
(Decoded: 'u+002D u+003E u+0020 u+0024 u+0031 u+002E u+0030 u+0030 u+0020 '+
'u+003C u+002D';
Encoded: '-> $1.00 <--')
);
DomainTestCases: array [0..2] of TDomainTestRec = (
(Decoded: '蒙古火锅-test.cn'; Encoded: 'xn---test-xt8h571o0z7ad54a.cn'),
(Decoded: 'президент.рф'; Encoded: 'xn--d1abbgf6aiiy.xn--p1ai'),
(Decoded: 'почта.мд'; Encoded: 'xn--80a1acny.xn--d1ap')
);
const
UnicodeMaxLength = 256;
AceMaxLength = 256;
function ExtractCodePoint(S: AnsiString; var Off: Integer; out caseflag: Byte; out codept: TPunyCode): Boolean;
procedure InvalidInput;
begin
raise EAssertionFailed.Create('Invalid input');
end;
begin
Result := False;
if Off > Length(S) then Exit;
if Off+5 > Length(S) then InvalidInput;
if S[Off] = 'u' then caseflag := 0
else if S[Off] = 'U' then caseflag := 1
else InvalidInput;
Inc(Off);
if S[Off] <> '+' then InvalidInput;
Inc(Off);
codept := StrToInt('$'+Copy(S, Off, 4));
Inc(Off, 4);
if (Off <= Length(S)) and (S[Off] <> ' ') then InvalidInput;
Inc(Off);
Result := True;
end;
procedure TestEncoder(Decoded, Encoded: AnsiString);
var
inlen, outlen: Cardinal;
caseflags: array [0..UnicodeMaxLength-1] of Byte;
input: array [0..UnicodeMaxLength-1] of TPunyCode;
output: array [0..AceMaxLength-1] of Byte;
Off: Integer;
caseflag: Byte;
codept: TPunyCode;
begin
inlen := 0;
Off := 1;
while ExtractCodePoint(Decoded, Off, caseflag, codept) do
begin
caseflags[inlen] := caseflag;
input[inlen] := codept;
Inc(inlen);
end;
outlen := AceMaxLength;
if (PunycodeEncode(inlen, @input, outlen, @output, @caseflags) <> pcSuccess) or
(outlen <> Cardinal(Length(Encoded))) or
not CompareMem(@output, @Encoded[1], outlen) then
raise EAssertionFailed.CreateFmt('Encoding failed: %s', [Decoded]);
end;
function MakeDecoded(outlen: Cardinal; output: PPunycode; caseflags: PByte): AnsiString;
var
_caseflags: PByteArray absolute caseflags;
I: Integer;
S: AnsiString;
begin
Result := '';
for I := 0 to outlen - 1 do
begin
if _caseflags[I] = 0 then S := 'u+'
else S := 'U+';
S := S + IntToHex(output[I], 4);
if Result = '' then Result := S
else Result := Result + ' ' + S;
end;
end;
procedure TestDecoder(Decoded, Encoded: AnsiString);
var
inlen, outlen: Cardinal;
caseflags: array [0..UnicodeMaxLength-1] of Byte;
output: array [0..UnicodeMaxLength-1] of TPunyCode;
begin
inlen := Length(Encoded);
if inlen > AceMaxLength then
raise EAssertionFailed.CreateFmt('Input is too big: %s', [Encoded]);
outlen := UnicodeMaxLength;
if (PunycodeDecode(inlen, PByte(@Encoded[1]), outlen, @output, @caseflags) <> pcSuccess) or
(MakeDecoded(outlen, @output, @caseflags) <> Decoded) then
raise EAssertionFailed.CreateFmt('Decoding failed: %s', [Encoded]);
end;
procedure TestCodec(Decoded, Encoded: AnsiString);
begin
TestEncoder(Decoded, Encoded);
TestDecoder(Decoded, Encoded);
end;
procedure TestCodecTestCases;
var
I: Integer;
begin
for I := 0 to High(CodecTestCases) do
TestCodec(CodecTestCases[I].Decoded, CodecTestCases[I].Encoded);
end;
procedure TestDomain(Decoded: WideString; Encoded: AnsiString);
begin
if PunycodeEncodeDomain(Decoded) <> Encoded then
raise EAssertionFailed.CreateFmt('Encoding failed: %s', [Decoded]);
if PunycodeDecodeDomain(Encoded) <> Decoded then
raise EAssertionFailed.CreateFmt('Decoding failed: %s', [Encoded]);
end;
procedure TestDomainTestCases;
var
I: Integer;
begin
for I := 0 to High(DomainTestCases) do
TestDomain(DomainTestCases[I].Decoded, DomainTestCases[I].Encoded);
end;
begin
TestCodecTestCases;
TestDomainTestCases;
MessageDlg('Punycode was successfully tested', mtInformation, [mbOK], 0);
end.
А здесь исправлена версия с добавленным PunycodeDecodeDomain. Протестировано в Dephi 2007 и Delphi XE3 (x86 и x64):
(*
* punycode.c from RFC 3492prop
* http://www.nicemice.net/idn/
* Adam M. Costello
* http://www.nicemice.net/amc/
*
* This is ANSI C code (C89) implementing Punycode (RFC 3492prop).
* Delphi Conversion by:
* Henri Gourvest <hgourvest@gmail.com>
* http://www.progdigy.com
* contributor
* J. Heffernan <info@heffs.org.uk>
* testing, fixing and refactoring
* Igor Tsurcanovsky <Igor@ritlabs.com>
usage:
function PEncode(const str: UnicodeString): AnsiString;
var
len: Cardinal;
begin
if str = '' then
begin
Result := '';
exit;
end;
if (PunycodeEncode(Length(str), PPunyCode(str), len) = pcSuccess) and (Length(str) + 1 <> len) then
begin
SetLength(Result, len);
PunycodeEncode(Length(str), PPunyCode(str), len, PByte(Result));
end else
Result := str;
end;
function PDecode(const str: AnsiString): UnicodeString;
var
outputlen: Cardinal;
begin
if str = '' then
begin
Result := '';
exit;
end;
outputlen := 0;
if (PunycodeDecode(Length(str), PByte(str), outputlen) = pcSuccess) and (Length(str) <> outputlen) then
begin
SetLength(Result, outputlen);
PunycodeDecode(Length(str), PByte(str), outputlen, PPunycode(Result));
end else
Result := str;
end;
procedure Test(const Input: UnicodeString);
begin
if PDecode(PEncode(Input))<>Input then
raise EAssertionFailed.CreateFmt('Round-trip failed: %s', [Input]);
end;
begin
Test('президент');
Test('David Heffernan');
Test('');
Test('A');
end.
*)
unit PunyCode;
interface
type
{$if (SizeOf(Char) = 1)}
// For compatibility with versions without UnicodeString (prior Delphi 2009)
UnicodeString = WideString;
{$ifend}
TPunyCodeStatus = (
pcSuccess,
pcBadInput, (* Input is invalid. *)
pcBigOutput, (* Output would exceed the space provided. *)
pcOverflow (* Input needs wider integers to process. *)
);
TPunyCode = Word;
TPunyCodeArray = array[0..(High(Integer) div SizeOf(TPunyCode)) - 1] of TPunyCode;
PPunycode = ^TPunyCodeArray;
function PunycodeDecode(inputlen: Cardinal; const input: PByte;
var outputlen: Cardinal; output: PPunycode = nil;
caseflags: PByte = nil): TPunyCodeStatus;
function PunycodeEncode(inputlen: Cardinal; const input: PPunycode;
var outputlen: Cardinal; const output: PByte = nil;
const caseflags: PByte = nil): TPunyCodeStatus; overload;
function PunycodeDecodeDomain(const str: AnsiString): UnicodeString;
function PunycodeEncodeDomain(const str: UnicodeString): AnsiString;
implementation
uses SysUtils;
type
PByteArray = ^TByteArray;
TByteArray = array [0..MaxInt-1] of Byte;
(*** Bootstring parameters for Punycode ***)
const
PUNY_BASE = 36;
PUNY_TMIN = 1;
PUNY_TMAX = 26;
PUNY_SKEW = 38;
PUNY_DAMP = 700;
PUNY_INITIAL_BIAS = 72;
PUNY_INITIAL_N = $80;
PUNY_DELIMITER = $2D;
// typedef unsigned int punycode_uint;
// /* maxint is the maximum value of a punycode_uint variable: */
// static const punycode_uint maxint = -1;
// /* Because maxint is unsigned, -1 becomes the maximum value. */
PUNY_maxint = High(Cardinal);
(* flagged(bcp) tests whether a basic code point is flagged *)
(* (uppercase). The behavior is undefined if bcp is not a *)
(* basic code point. *)
function PUNY_flagged(bcp: Cardinal): Byte; inline;
begin
Result := Ord(bcp - 65 < 26);
end;
(* DecodeDigit(cp) returns the numeric value of a basic code *)
(* point (for use in representing integers) in the range 0 to *)
(* BASE-1, or BASE if cp is does not represent a value. *)
function PUNY_DecodeDigit(cp: Cardinal): Cardinal; inline;
begin
if (cp - 48 < 10) then
Result := cp - 22
else if (cp - 65 < 26) then
Result := cp - 65
else if (cp - 97 < 26) then
Result := cp - 97
else
Result := PUNY_BASE;
end;
(* EncodeDigit(d,flag) returns the basic code point whose value *)
(* (when used for representing integers) is d, which needs to be in *)
(* the range 0 to BASE-1. The lowercase form is used unless flag is *)
(* nonzero, in which case the uppercase form is used. The behavior *)
(* is undefined if flag is nonzero and digit d has no uppercase form. *)
function PUNY_EncodeDigit(d: Cardinal; flag: Boolean): Byte; inline;
begin
Result := d + 22 + 75 * Ord(d < 26) - (Ord(flag) shl 5);
(* 0..25 map to ASCII a..z or A..Z *)
(* 26..35 map to ASCII 0..9 *)
end;
(* EncodeBasic(bcp,flag) forces a basic code point to lowercase *)
(* if flag is zero, uppercase if flag is nonzero, and returns *)
(* the resulting code point. The code point is unchanged if it *)
(* is caseless. The behavior is undefined if bcp is not a basic *)
(* code point. *)
function PUNY_EncodeBasic(bcp: Cardinal; flag: Integer): Byte; inline;
begin
Dec(bcp, Ord(bcp - 97 < 26) shl 5);
Result := bcp + (((not flag) and Ord(bcp - 65 < 26)) shl 5);
end;
(*** Bias adaptation function ***)
function PUNY_Adapt(delta, numpoints: Cardinal; firsttime: Boolean): Cardinal; inline;
var
k: TPunyCode;
begin
if firsttime then
delta := delta div PUNY_DAMP
else
delta := delta shr 1;
(* delta shr 1 is a faster way of doing delta div 2 *)
Inc(delta, delta div numpoints);
k := 0;
while (delta > ((PUNY_BASE - PUNY_TMIN) * PUNY_TMAX) div 2) do
begin
delta := delta div (PUNY_BASE - PUNY_TMIN);
Inc(k, PUNY_BASE);
end;
Result := k + (PUNY_BASE - PUNY_TMIN + 1) * delta div (delta + PUNY_SKEW);
end;
(* PunycodeEncode() converts Unicode to Punycode. The input *)
(* is represented as an array of Unicode code points (not code *)
(* units; surrogate pairs are not allowed), and the output *)
(* will be represented as an array of ASCII code points. The *)
(* output string is *not* null-terminated; it will contain *)
(* zeros if and only if the input contains zeros. (Of course *)
(* the caller can leave room for a terminator and add one if *)
(* needed.) The inputlen is the number of code points in *)
(* the input. The outputlen is an in/out argument: the *)
(* caller passes in the maximum number of code points that it *)
(* can receive, and on successful return it will contain the *)
(* number of code points actually output. The case_flags array *)
(* holds input_length boolean values, where nonzero suggests that *)
(* the corresponding Unicode character be forced to uppercase *)
(* after being decoded (if possible), and zero suggests that *)
(* it be forced to lowercase (if possible). ASCII code points *)
(* are encoded literally, except that ASCII letters are forced *)
(* to uppercase or lowercase according to the corresponding *)
(* uppercase flags. If case_flags is a null pointer then ASCII *)
(* letters are left as they are, and other code points are *)
(* treated as if their uppercase flags were zero. The return *)
(* value can be any of the TPunyCodeStatus values defined above *)
(* except pcBadInput; if not pcSuccess, then *)
(* output_size and output might contain garbage. *)
function PunycodeEncode(inputlen: Cardinal; const input: PPunycode;
var outputlen: Cardinal; const output: PByte = nil;
const caseflags: PByte = nil): TPunyCodeStatus;
var
outidx, maxout, n, delta, h, b, bias, m, q, k, t: Cardinal;
j: Integer;
_output: PByteArray absolute output;
_caseflags: PByteArray absolute caseflags;
begin
(* Initialize the state: *)
n := PUNY_INITIAL_N;
outidx := 0;
delta := outidx;
maxout := outputlen;
bias := PUNY_INITIAL_BIAS;
(* Handle the basic code points: *)
for j := 0 to inputlen - 1 do
begin
if (input[j] < $80) then
begin
if (output <> nil) then
begin
if (maxout - outidx < 2) then
begin
Result := pcBigOutput;
Exit;
end;
if (caseflags <> nil) then
_output[outidx] := PUNY_EncodeBasic(input[j], _caseflags[j])
else
_output[outidx] := input[j];
end;
Inc(outidx);
end;
(* else if (input[j] < n) return pcBadInput; *)
(* (not needed for Punycode with unsigned code points) *)
end;
b := outidx;
h := b;
(* h is the number of code points that have been handled, b is the *)
(* number of basic code points, and out is the number of characters *)
(* that have been output. *)
if (b > 0) then
begin
if (output <> nil) then
_output[outidx] := PUNY_DELIMITER;
Inc(outidx);
end;
(* Main encoding loop: *)
while (h < inputlen) do
begin
(* All non-basic code points < n have been *)
(* handled already. Find the next larger one: *)
m := PUNY_maxint;
for j := 0 to inputlen - 1 do
(* if (basic(input[j])) continue; *)
(* (not needed for Punycode) *)
if ((input[j] >= n) and (input[j] < m)) then
m := input[j];
(* Increase delta enough to advance the decoder's *)
(* <n,i> state to <m,0>, but guard against overflow: *)
if (m - n > (PUNY_maxint - delta) div (h + 1)) then
begin
Result := pcOverflow;
Exit;
end;
Inc(delta, (m - n) * (h + 1));
n := m;
for j := 0 to inputlen - 1 do
begin
(* Punycode does not need to check whether input[j] is basic: *)
if (input[j] < n (* or basic(input[j]) *) ) then
begin
Inc(delta);
if (delta = 0) then
begin
Result := pcOverflow;
Exit;
end;
end;
if (input[j] = n) then
begin
(* Represent delta as a generalized variable-length integer: *)
q := delta;
k := PUNY_BASE;
while true do
begin
if (output <> nil) then
if (outidx >= maxout) then
begin
Result := pcBigOutput;
Exit;
end;
if k <= bias (* + TMIN *) then (* +TMIN not needed *)
t := PUNY_TMIN
else if k >= bias + PUNY_TMAX then
t := PUNY_TMAX
else
t := k - bias;
if (q < t) then
break;
if (output <> nil) then
_output[outidx] := PUNY_EncodeDigit(t + (q - t) mod (PUNY_BASE - t), False);
Inc(outidx);
q := (q - t) div (PUNY_BASE - t);
Inc(k, PUNY_BASE);
end;
if (output <> nil) then
_output[outidx] := PUNY_EncodeDigit(q,
(caseflags <> nil) and (_caseflags[j] <> 0));
Inc(outidx);
bias := PUNY_Adapt(delta, h + 1, h = b);
delta := 0;
Inc(h);
end;
end;
Inc(delta);
Inc(n);
end;
outputlen := outidx;
Result := pcSuccess;
end;
(* PunycodeDecode() converts Punycode to Unicode. The input is *)
(* represented as an array of ASCII code points, and the output *)
(* will be represented as an array of Unicode code points. The *)
(* input_length is the number of code points in the input. The *)
(* output_length is an in/out argument: the caller passes in *)
(* the maximum number of code points that it can receive, and *)
(* on successful return it will contain the actual number of *)
(* code points output. The case_flags array needs room for at *)
(* least output_length values, or it can be a null pointer if the *)
(* case information is not needed. A nonzero flag suggests that *)
(* the corresponding Unicode character be forced to uppercase *)
(* by the caller (if possible), while zero suggests that it be *)
(* forced to lowercase (if possible). ASCII code points are *)
(* output already in the proper case, but their flags will be set *)
(* appropriately so that applying the flags would be harmless. *)
(* The return value can be any of the TPunyCodeStatus values *)
(* defined above; if not pcSuccess, then output_length, *)
(* output, and case_flags might contain garbage. On success, the *)
(* decoder will never need to write an output_length greater than *)
(* input_length, because of how the encoding is defined. *)
function PunycodeDecode(inputlen: Cardinal; const input: PByte;
var outputlen: Cardinal; output: PPunycode;
caseflags: PByte): TPunyCodeStatus;
var
outidx, i, maxout, bias, b, inidx, oldi, w, k, digit, t, n : Cardinal;
j: Integer;
_input: PByteArray absolute input;
_caseflags: PByteArray absolute caseflags;
begin
(* Initialize the state: *)
n := PUNY_INITIAL_N;
outidx := 0;
i := outidx;
maxout := outputlen;
bias := PUNY_INITIAL_BIAS;
(* Handle the basic code points: Let b be the number of input code *)
(* points before the last DELIMITER, or 0 if there is none, then *)
(* copy the first b code points to the output. *)
b := 0;
for j := 0 to inputlen - 1 do
if _input[j] = PUNY_DELIMITER then
b := j;
if output <> nil then
if (b > maxout) then
begin
Result := pcBigOutput;
Exit;
end;
for j := 0 to b - 1 do
begin
if (caseflags <> nil) then
_caseflags[outidx] := PUNY_flagged(_input[j]);
if (_input[j] >= $80) then
begin
Result := pcBadInput;
Exit;
end;
if output <> nil then
output[outidx] := _input[j];
Inc(outidx);
end;
(* Main decoding loop: Start just after the last DELIMITER if any *)
(* basic code points were copied; start at the beginning otherwise. *)
if (b > 0) then
inidx := b + 1
else
inidx := 0;
while inidx < inputlen do
begin
(* in is the index of the next character to be consumed, and *)
(* out is the number of code points in the output array. *)
(* Decode a generalized variable-length integer into delta, *)
(* which gets added to i. The overflow checking is easier *)
(* if we increase i as we go, then subtract off its starting *)
(* value at the end to obtain delta. *)
oldi := i;
w := 1;
k := PUNY_BASE;
while true do
begin
if (inidx >= inputlen) then
begin
Result := pcBadInput;
Exit;
end;
digit := PUNY_DecodeDigit(_input[inidx]);
Inc(inidx);
if (digit >= PUNY_BASE) then
begin
Result := pcBadInput;
Exit;
end;
if (digit > (PUNY_maxint - i) div w) then
begin
Result := pcOverflow;
Exit;
end;
Inc(i, digit * w);
if k <= bias (* + TMIN *) then
t := PUNY_TMIN
else (* +TMIN not needed *)
if k >= bias + PUNY_TMAX then
t := PUNY_TMAX
else
t := k - bias;
if (digit < t) then
break;
if (w > (PUNY_maxint div (PUNY_BASE - t))) then
begin
Result := pcOverflow;
Exit;
end;
w := w * (PUNY_BASE - t);
Inc(k, PUNY_BASE);
end;
bias := PUNY_Adapt(i - oldi, outidx + 1, oldi = 0);
(* i was supposed to wrap around from out+1 to 0, *)
(* incrementing n each time, so we'll fix that now: *)
if (i div (outidx + 1) > PUNY_maxint - n) then
begin
Result := pcOverflow;
Exit;
end;
Inc(n, i div (outidx + 1));
i := i mod (outidx + 1);
(* Insert n at position i of the output: *)
(* not needed for Punycode: *)
(* if (DecodeDigit(n) <= BASE) return punycode_invalid_input; *)
if output <> nil then
if (outidx >= maxout) then
begin
Result := pcBigOutput;
Exit;
end;
if (caseflags <> nil) then
begin
move(_caseflags[i], _caseflags[i + 1], outidx - i);
(* Case of last character determines uppercase flag: *)
_caseflags[i] := PUNY_flagged(_input[inidx - 1]);
end;
if output <> nil then
begin
move(output[i], output[i + 1], (outidx - i) * SizeOf(TPunyCode));
output[i] := n;
end;
Inc(i);
Inc(outidx);
end;
outputlen := outidx;
Result := pcSuccess;
end;
function PunycodeDecodeDomain(const str: AnsiString): UnicodeString;
var
p, s: PAnsiChar;
procedure DoIt(dot: Boolean);
var
inlen, outlen: Cardinal;
unicode: UnicodeString;
u: PWideChar;
begin
inlen := p - s;
if (inlen > 4) and (StrLIComp(s, 'xn--', 4) = 0) and
(PunycodeDecode(inlen-4, PByte(@s[4]), outlen) = pcSuccess) then
begin
if dot then
SetLength(unicode, outlen + 1)
else
SetLength(unicode, outlen);
u := PWideChar(unicode);
PunycodeDecode(inlen-4, PByte(@s[4]), outlen, PPunyCode(u));
if dot then
begin
inc(u, outlen);
u^ := '.';
end;
end else
if dot then
SetString(unicode, s, inlen + 1)
else
SetString(unicode, s, inlen);
Result := Result + unicode;
end;
begin
Result := '';
p := PAnsiChar(str);
s := p;
while True do
case p^ of
'.':
begin
DoIt(True);
Inc(p);
s := p;
end;
#0 :
begin
DoIt(False);
Break;
end;
else
Inc(p);
end;
end;
function PunycodeEncodeDomain(const str: UnicodeString): AnsiString;
var
p, s: PWideChar;
procedure DoIt(dot: Boolean);
var
inlen, outlen: Cardinal;
ansi: AnsiString;
a: PAnsiChar;
begin
inlen := p - s;
if (PunycodeEncode(inlen, PPunyCode(s), outlen) = pcSuccess) and (inlen + 1 <> outlen) then
begin
if dot then
SetLength(ansi, outlen + 4 + 1)
else
SetLength(ansi, outlen + 4);
a := PAnsiChar(ansi);
Move(PAnsiChar('xn--')^, a^, 4);
inc(a, 4);
PunycodeEncode(inlen, PPunyCode(s), outlen, PByte(a));
if dot then
begin
inc(a, outlen);
a^ := '.';
end;
end else
if dot then
SetString(ansi, s, inlen + 1)
else
SetString(ansi, s, inlen);
Result := Result + ansi;
end;
begin
Result := '';
p := PWideChar(str);
s := p;
while True do
case p^ of
'.':
begin
DoIt(True);
Inc(p);
s := p;
end;
#0 :
begin
DoIt(False);
Break;
end;
else
Inc(p);
end;
end;
end.