Unicode字符串中的奇怪字符

问题描述：

在实现MP3 ID3 v2标签的读取时，我遇到了一个问题。除了这个问题之外，大部分功能已经可以正常工作，而且这个问题可能与ID3本身无关。总之，我使用下面的代码来读取包含文本的标签帧数据。

我遇到的情况（我猜？）是某些字符串中含有Unicode字符。我尝试用下面的方式进行转换，基本可以工作。但是，字符串前面多出了一个值为 $3 的字符，后面又多出了 $3 $3。我能否修改下面的代码把这些字符解析掉，还是必须自己手动处理？这些文件是由iTunes编码的，希望这个信息有帮助。

function Id3v2_string(currp: pointer; datasize: integer): string;
{ Returns the text held in a raw ID3v2 data buffer.

  currp    - start of the text bytes
  datasize - number of bytes available at currp

  The buffer is handed to IsTextUnicode with every test in the $0F mask
  enabled. When that call reports Unicode text the buffer is converted with
  WideCharToString; otherwise the bytes are copied verbatim into the result. }
const
  IS_TEXT_UNICODE_UNICODE_MASK = $0F;
var
  testFlags: integer;
begin
  Result := '';
  SetLength(Result, datasize);

  // IsTextUnicode takes the requested tests in, and reports the ones that
  // passed out, through the same variable - hence a local copy of the mask.
  testFlags := IS_TEXT_UNICODE_UNICODE_MASK;

  if not IsTextUnicode(currp, datasize, @testFlags) then
    // Not recognised as Unicode: raw byte-for-byte copy.
    move(currp^, Result[1], datasize)
  else
    // NOTE(review): WideCharToString takes no length argument, so datasize is
    // not applied on this path - confirm the buffer is null-terminated.
    Result := WideCharToString(currp);
end;

注意：我对使用现成的媒体库并不感兴趣，因为我只想编辑ID3标签，而不是播放文件——除了少数像这样的小问题之外，实现已经基本完成。

答

根据所使用的ID3 v2版本，文本字符串可能带有、也可能不带有一个前缀字节来指明字符串的实际编码。不要使用IsTextUnicode()来猜测编码（尤其是因为它可能给出错误的结果）。

在v2.3及之前的ID3 v2版本中，没有编码字节；文本要么是ISO-8859-1，要么是UCS-2，而UCS-2字符串始终以BOM开头，因此可以据此判断字节序。例如：

// prior to Delphi 2009 - String is Ansi 
function Id3v2_string(currp: Pointer; datasize: Integer): String;
{ Decodes a text buffer that carries no separate encoding byte (the layout
  the surrounding text describes for ID3v2 up to v2.3).
  Intended for compilers where String is an Ansi string.

  currp    - start of the raw text bytes
  datasize - number of bytes available at currp

  A leading $FEFF / $FFFE word is taken as a byte-order mark and the rest of
  the buffer is read as 16-bit characters; anything else is decoded as
  ISO-8859-1 (code page 28591). Trailing blanks and control characters,
  including NUL terminators, are removed. Returns '' for a nil or empty
  buffer. }
var
  W: WideString;
  I: Integer;
  Ch: WideChar;
  Bom: Word;
begin
  Result := '';

  // Nothing to decode. This also stops a negative size from reaching
  // MultiByteToWideChar, where -1 means "input is null-terminated".
  if (currp = nil) or (datasize <= 0) then
    Exit;

  Bom := 0;
  if datasize >= SizeOf(Word) then
    Bom := PWord(currp)^;

  if (Bom = $FEFF) or (Bom = $FFFE) then begin
    // 16-bit text with BOM. Copy the code units straight into the WideString:
    // going through an Ansi String first (as WideCharLenToString does here)
    // would push the data through the ANSI code page *before* the byte swap
    // below and corrupt it.
    SetString(W, PWideChar(PAnsiChar(currp) + SizeOf(Word)),
      (datasize - SizeOf(Word)) div SizeOf(WideChar));
    if Bom = $FFFE then begin
      // The BOM reads back byte-swapped, so every code unit is swapped too.
      for I := 1 to Length(W) do begin
        Ch := W[I];
        W[I] := WideChar(((Word(Ch) and $FF) shl 8) or (Word(Ch) shr 8));
      end;
    end;
  end else begin
    // ISO-8859-1: first call asks for the required length, second converts.
    I := MultiByteToWideChar(28591, 0, PAnsiChar(currp), datasize, nil, 0);
    if I > 0 then begin
      SetLength(W, I);
      MultiByteToWideChar(28591, 0, PAnsiChar(currp), datasize, PWideChar(W), I);
    end;
  end;

  Result := TrimRight(W);
end;

。

// Delphi 2009+ - String is Unicode 
function Id3v2_string(currp: Pointer; datasize: Integer): String;
{ Decodes a text buffer that carries no separate encoding byte (the layout
  the surrounding text describes for ID3v2 up to v2.3).
  Intended for compilers where String is a Unicode string.

  currp    - start of the raw text bytes
  datasize - number of bytes available at currp

  A leading $FEFF / $FFFE word is taken as a byte-order mark and selects the
  little- or big-endian 16-bit encoding; anything else is decoded as
  ISO-8859-1 (code page 28591). Trailing blanks and control characters,
  including NUL terminators, are removed, as the other Id3v2_string variants
  do. Returns '' for a nil or empty buffer. }
var
  Enc: TEncoding;

  // Copies Size bytes from P into a TBytes and decodes them with Enc.
  function Convert(P: Pointer; Size: Integer): String;
  var
    Buf: TBytes;
  begin
    Result := '';
    if Size <= 0 then
      Exit;
    SetLength(Buf, Size);
    Move(P^, Buf[0], Size);
    Result := Enc.GetString(Buf);
  end;

begin
  Result := '';
  if (currp = nil) or (datasize <= 0) then
    Exit;

  if (datasize >= SizeOf(Word)) and ((PWord(currp)^ = $FEFF) or (PWord(currp)^ = $FFFE)) then begin
    // 16-bit text with BOM. The standard encodings are shared instances and
    // are not freed here.
    if PWord(currp)^ = $FFFE then
      Enc := TEncoding.BigEndianUnicode
    else
      Enc := TEncoding.Unicode;
    // Step over the BOM with byte arithmetic so the expression does not rely
    // on pointer math being enabled for PWord.
    Result := Convert(PByte(currp) + SizeOf(Word), datasize - SizeOf(Word));
  end else begin
    // ISO-8859-1: GetEncoding hands back a new instance that we own.
    Enc := TEncoding.GetEncoding(28591);
    try
      Result := Convert(currp, datasize);
    finally
      Enc.Free;
    end;
  end;

  Result := TrimRight(Result);
end;

ID3 v2.4把UCS-2换成了UTF-16，并增加了对UTF-8以及不带BOM的UTF-16BE的支持，例如：

// prior to Delphi 2009 - String is Ansi 
function Id3v2_string(currp: Pointer; datasize: Integer; Encoding: Byte): String;
{ Decodes a text buffer whose encoding is given explicitly.
  Intended for compilers where String is an Ansi string.

  currp    - start of the raw text bytes
  datasize - number of bytes available at currp
  Encoding - $00 ISO-8859-1, $01 UTF-16 with BOM, $02 UTF-16BE without BOM,
             $03 UTF-8; any other value yields ''

  Trailing blanks and control characters, including NUL terminators, are
  removed. Returns '' for a nil or empty buffer. }
var
  W: WideString;
  I: Integer;

  // Swaps the two bytes of every 16-bit code unit in W.
  procedure SwapByteOrder;
  var
    J: Integer;
    Ch: WideChar;
  begin
    for J := 1 to Length(W) do begin
      Ch := W[J];
      W[J] := WideChar(((Word(Ch) and $FF) shl 8) or (Word(Ch) shr 8));
    end;
  end;

  // Converts the whole buffer from the given Windows code page into W.
  procedure FromCodePage(CodePage: Cardinal);
  begin
    // First call asks for the required length, second one converts.
    I := MultiByteToWideChar(CodePage, 0, PAnsiChar(currp), datasize, nil, 0);
    if I > 0 then begin
      SetLength(W, I);
      MultiByteToWideChar(CodePage, 0, PAnsiChar(currp), datasize, PWideChar(W), I);
    end;
  end;

begin
  Result := '';

  // Nothing to decode. This also keeps the BOM probe below from touching a
  // nil pointer and stops a size of -1 from reaching MultiByteToWideChar,
  // where it means "input is null-terminated".
  if (currp = nil) or (datasize <= 0) then
    Exit;

  case Encoding of
    $00:
      // ISO-8859-1
      FromCodePage(28591);
    $01: begin
      // UTF-16 with BOM; a buffer too short to hold the BOM has no text.
      if datasize >= SizeOf(Word) then begin
        SetString(W, PWideChar(PAnsiChar(currp) + SizeOf(Word)),
          (datasize - SizeOf(Word)) div SizeOf(WideChar));
        if PWord(currp)^ = $FFFE then
          // The BOM reads back byte-swapped, so swap every code unit.
          SwapByteOrder;
      end;
    end;
    $02: begin
      // UTF-16BE without BOM: always swapped.
      SetString(W, PWideChar(currp), datasize div SizeOf(WideChar));
      SwapByteOrder;
    end;
    $03:
      // UTF-8
      FromCodePage(65001);
  end;

  Result := TrimRight(W);
end;

。

// Delphi 2009+ - String is Unicode 
function Id3v2_string(currp: Pointer; datasize: Integer; Encoding: Byte): String;
{ Decodes a text buffer whose encoding is given explicitly.
  Intended for compilers where String is a Unicode string.

  currp    - start of the raw text bytes
  datasize - number of bytes available at currp
  Encoding - $00 ISO-8859-1, $01 UTF-16 with BOM, $02 UTF-16BE without BOM,
             $03 UTF-8; any other value yields ''

  Trailing blanks and control characters, including NUL terminators, are
  removed. Returns '' for a nil or empty buffer. }
var
  Enc: TEncoding;

  // Copies Size bytes from P into a TBytes and decodes them with Enc.
  function Convert(P: Pointer; Size: Integer): String;
  var
    Buf: TBytes;
  begin
    Result := '';
    if Size <= 0 then
      Exit;
    SetLength(Buf, Size);
    Move(P^, Buf[0], Size);
    Result := Enc.GetString(Buf);
  end;

begin
  Result := '';

  // Nothing to decode; also keeps the BOM probe below off a nil pointer.
  if (currp = nil) or (datasize <= 0) then
    Exit;

  case Encoding of
    $00: begin
      // ISO-8859-1: GetEncoding hands back a new instance that we own.
      Enc := TEncoding.GetEncoding(28591);
      try
        Result := Convert(currp, datasize);
      finally
        Enc.Free;
      end;
    end;
    $01: begin
      // UTF-16 with BOM. A buffer shorter than the BOM has no text; without
      // this check the size passed to Convert would go negative.
      if datasize >= SizeOf(Word) then begin
        if PWord(currp)^ = $FFFE then
          Enc := TEncoding.BigEndianUnicode
        else
          Enc := TEncoding.Unicode;
        // Step over the BOM with byte arithmetic so the expression does not
        // rely on pointer math being enabled for PWord.
        Result := Convert(PByte(currp) + SizeOf(Word), datasize - SizeOf(Word));
      end;
    end;
    $02: begin
      // UTF-16BE without BOM
      Enc := TEncoding.BigEndianUnicode;
      Result := Convert(currp, datasize);
    end;
    $03: begin
      // UTF-8
      Enc := TEncoding.UTF8;
      Result := Convert(currp, datasize);
    end;
  end;

  Result := TrimRight(Result);
end;

明白了。谢谢！ – Glenn1234 2012-03-03 02:28:10

Unicode字符串中的奇怪字符

相关推荐