htmlspecialcharsのパッチ私案 - 岩本隆史の日記帳（アーカイブ）

PHPのhtmlspecialchars関数について、文字エンコーディングの妥当性チェックが不充分であること、また場合によってはXSS攻撃が可能になることが、下記ふたつの記事で指摘されています。

徳丸さんの記事では、仕様変更も提案されています。

冗長なUTF-8は不正なエンコーディングとして扱う（出力を空にする）

Shift_JIS、EUC-JPの2バイト目が不正な場合も、エラーとして出力を空にする

htmlspecialcharsは不正な文字エンコーディングをどこまでチェックするか | 徳丸浩の日記

さらに、id:t_komuraさんの記事をうければ：

不正なバイトが単独で指定された場合も、エラーとして出力を空にする

という仕様も取り込むべきでしょう。

パッチを書いてみた

私はPHP本体のコードはもちろん、C言語の知識もほとんどないのですが、妥当性チェック部分のコードを読んでみて、仕様変更パッチが簡単に書けそうだと思ったので、書いてみました。ただし勉強がてらなので、まずはShift_JISとEUC-JPにのみ手をつけています。PHPのバージョンは5.3.0です。

$ diff -u html.c.old html.c.new
--- html.c.old    2009-10-05 12:38:12.000000000 +0900
+++ html.c.new    2009-10-05 12:55:20.000000000 +0900
@@ -648,9 +648,14 @@
                         MB_WRITE(next_char);
                         this_char |= next_char;
                         pos++;
-                    }
-                    
-                }
+                    } else {
+                        *status = FAILURE;
+                    }
+                } else if (!(this_char >= 0x00 && this_char <= 0x7f) &&
+                           !(this_char >= 0xa1 && this_char <= 0xdf))
+                {
+                    *status = FAILURE;
+                }
                 break;
             }
         case cs_eucjp:
@@ -666,8 +671,9 @@
                         MB_WRITE(next_char);
                         this_char |= next_char;
                         pos++;
-                    }
-                    
+                    } else {
+                        *status = FAILURE;
+                    }
                 } else if (this_char == 0x8e) {
                     /* peek at the next char */
                     CHECK_LEN(pos, 1);
@@ -678,8 +684,9 @@
                         MB_WRITE(next_char);
                         this_char |= next_char;
                         pos++;
-                    }
-                    
+                    } else {
+                        *status = FAILURE;
+                    }
                 } else if (this_char == 0x8f) {
                     /* peek at the next two char */
                     unsigned char next2_char;
@@ -697,9 +704,12 @@
                         MB_WRITE(next2_char);
                         this_char |= next2_char;
                         pos++;
-                    }
-                    
-                }
+                    } else {
+                        *status = FAILURE;
+                    }
+                } else if (!(this_char >= 0x00 && this_char <= 0x7f)) {
+                    *status = FAILURE;
+                }
                 break;
             }
         default:

diffでは分かりづらいので、パッチ適用前後のコードを載せておきます。変更されるのは、ext/standard/html.c で定義されている get_next_char 関数の一部です。

変更前

        case cs_sjis:
            {
                /* check if this is the first of a 2-byte sequence */
                if ( (this_char >= 0x81 && this_char <= 0x9f) ||
                     (this_char >= 0xe0 && this_char <= 0xef)
                    ) {
                    /* peek at the next char */
                    CHECK_LEN(pos, 1);
                    next_char = str[pos];
                    if ((next_char >= 0x40 && next_char <= 0x7e) ||
                        (next_char >= 0x80 && next_char <= 0xfc))
                    {
                        /* yes, this a wide char */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                    }
                    
                }
                break;
            }
        case cs_eucjp:
            {
                /* check if this is the first of a multi-byte sequence */
                if (this_char >= 0xa1 && this_char <= 0xfe) {
                    /* peek at the next char */
                    CHECK_LEN(pos, 1);
                    next_char = str[pos];
                    if (next_char >= 0xa1 && next_char <= 0xfe) {
                        /* yes, this a jis kanji char */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                    }
                    
                } else if (this_char == 0x8e) {
                    /* peek at the next char */
                    CHECK_LEN(pos, 1);
                    next_char = str[pos];
                    if (next_char >= 0xa1 && next_char <= 0xdf) {
                        /* JIS X 0201 kana */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                    }
                    
                } else if (this_char == 0x8f) {
                    /* peek at the next two char */
                    unsigned char next2_char;
                    CHECK_LEN(pos, 2);
                    next_char = str[pos];
                    next2_char = str[pos+1];
                    if ((next_char >= 0xa1 && next_char <= 0xfe) &&
                        (next2_char >= 0xa1 && next2_char <= 0xfe)) {
                        /* JIS X 0212 hojo-kanji */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                        this_char <<= 8;
                        MB_WRITE(next2_char);
                        this_char |= next2_char;
                        pos++;
                    }
                    
                }
                break;
            }

変更後

        case cs_sjis:
            {
                /* check if this is the first of a 2-byte sequence */
                if ( (this_char >= 0x81 && this_char <= 0x9f) ||
                     (this_char >= 0xe0 && this_char <= 0xef)
                    ) {
                    /* peek at the next char */
                    CHECK_LEN(pos, 1);
                    next_char = str[pos];
                    if ((next_char >= 0x40 && next_char <= 0x7e) ||
                        (next_char >= 0x80 && next_char <= 0xfc))
                    {
                        /* yes, this a wide char */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                    } else {
                        *status = FAILURE;
                    }
                } else if (!(this_char >= 0x00 && this_char <= 0x7f) &&
                           !(this_char >= 0xa1 && this_char <= 0xdf))
                {
                    *status = FAILURE;
                }
                break;
            }
        case cs_eucjp:
            {
                /* check if this is the first of a multi-byte sequence */
                if (this_char >= 0xa1 && this_char <= 0xfe) {
                    /* peek at the next char */
                    CHECK_LEN(pos, 1);
                    next_char = str[pos];
                    if (next_char >= 0xa1 && next_char <= 0xfe) {
                        /* yes, this a jis kanji char */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                    } else {
                        *status = FAILURE;
                    }
                } else if (this_char == 0x8e) {
                    /* peek at the next char */
                    CHECK_LEN(pos, 1);
                    next_char = str[pos];
                    if (next_char >= 0xa1 && next_char <= 0xdf) {
                        /* JIS X 0201 kana */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                    } else {
                        *status = FAILURE;
                    }
                } else if (this_char == 0x8f) {
                    /* peek at the next two char */
                    unsigned char next2_char;
                    CHECK_LEN(pos, 2);
                    next_char = str[pos];
                    next2_char = str[pos+1];
                    if ((next_char >= 0xa1 && next_char <= 0xfe) &&
                        (next2_char >= 0xa1 && next2_char <= 0xfe)) {
                        /* JIS X 0212 hojo-kanji */
                        this_char <<= 8;
                        MB_WRITE(next_char);
                        this_char |= next_char;
                        pos++;
                        this_char <<= 8;
                        MB_WRITE(next2_char);
                        this_char |= next2_char;
                        pos++;
                    } else {
                        *status = FAILURE;
                    }
                } else if (!(this_char >= 0x00 && this_char <= 0x7f)) {
                    *status = FAILURE;
                }
                break;
            }

簡単なテストケースはパスした

パッチ適用後のPHP5.3.0をビルドし、徳丸さんのテストコードを拝借して検証してみました。単独不正バイトのテストを追加しています。

テストコード：

<?php
function test($s, $enc) {
  $e = htmlspecialchars($s, ENT_QUOTES, $enc);
  echo bin2hex($s) . ':' . $s . ' -> ';
  echo bin2hex($e) . ':' . $e . "\n";
}

  echo phpversion() . "\n";

  test("\xC0\xAF", 'UTF-8');                  // 「/」の冗長表現(2バイト)
  test("\xE0\x80\xAF", 'UTF-8');              // 「/」の冗長表現(3バイト)
  test("\xF0\x80\x80\xAF", 'UTF-8');          // 「/」の冗長表現(4バイト)
  test("\xF8\x80\x80\x80\xAF", 'UTF-8');      // 「/」の冗長表現(5バイト)
  test("\xFC\x80\x80\x80\x80\xAF", 'UTF-8');  // 「/」の冗長表現(6バイト)
echo "-------------------------------\n";
  test("\xC0\xBC", 'UTF-8');                  // 「<」の冗長表現(2バイト)
  test("\xE0\x80\xBC", 'UTF-8');              // 「<」の冗長表現(3バイト)
  test("\xF0\x80\x80\xBC", 'UTF-8');          // 「<」の冗長表現(4バイト)
  test("\xF8\x80\x80\x80\xBC", 'UTF-8');      // 「<」の冗長表現(5バイト)
  test("\xFC\x80\x80\x80\x80\xBC", 'UTF-8');  // 「<」の冗長表現(6バイト)
echo "-------------------------------\n";
  test("A\xC2", 'UTF-8');                     // C2 (2バイトパターンなのに1バイト)
  test("A\xC2<", 'UTF-8');                    // C2 に < が続く
  test("A\xC2/", 'UTF-8');                    // C2 に / が続く
  test("A\xE6\xBC", 'UTF-8');                 // E6 BC (3バイトパターンなのに2バイト)
  test("A\xE6\xBC<", 'UTF-8');                // E6 BC に < が続く
  test("A\xE6\xBC/", 'UTF-8');                // E6 BC に / が続く
  test("\xF0", 'UTF-8');                      // F0（単独の不正バイト）
echo "-------------------------------\n";
  test("A\x8A", 'Shift_JIS');                 // 8A (Shift_JISの先行バイト)
  test("A\x8A/", 'Shift_JIS');                // 8A / が続く
  test("A\x8A<", 'Shift_JIS');                // 8A < が続く
  test("\xF0", 'Shift_JIS');                  // F0（単独の不正バイト）
echo "-------------------------------\n";
  test("A\xB4", 'EUC-JP');                    // B4 (EUC-JPの先行バイト)
  test("A\xB4/", 'EUC-JP');                   // B4 に / が続く
  test("A\xB4<", 'EUC-JP');                   // B4 に < が続く
  test("\xF0", 'EUC-JP');                     // F0（単独の不正バイト）

出力結果はこうでした。

5.3.0
c0af:■ -> c0af:■
e080af:■ -> e080af:■
f08080af:■ -> f08080af:■
f8808080af:■ -> f8808080af:■
fc80808080af:■ -> fc80808080af:■
-------------------------------
c0bc:■ -> 266c743b:&lt;
e080bc:■ -> 266c743b:&lt;
f08080bc:■ -> 266c743b:&lt;
f8808080bc:■ -> 266c743b:&lt;
fc80808080bc:■ -> 266c743b:&lt;
-------------------------------
41c2:A■ -> :
41c23c:A■< -> :
41c22f:A■/ -> :
41e6bc:A■ -> :
41e6bc3c:A■< -> :
41e6bc2f:A■/ -> :
f0:■ -> :
-------------------------------
418a:A■ -> :
418a2f:A■/ -> :
418a3c:A■< -> :
f0:■ -> :
-------------------------------
41b4:A■ -> :
41b42f:A■/ -> :
41b43c:A■< -> :
f0:■ -> :

Shift_JISもEUC-JPも、全ケースの出力が空になっています。所期の目的は達成できました。なお、徳丸さんの環境では化けていないUTF-8の出力が私の環境で化けているのは、コンソールの違いによるものですので無視してください。

繰り返しになりますが、勉強がてらに書いたパッチなので、不適当な部分があるかもしれません。お気づきの方は、ぜひご教示ください。

今後どうするか

今後、UTF-8の分のパッチも書いてみようとは思っていますが、早い方なら数分で書けるような内容でしょうから、「俺が書かなきゃ誰が書く」といった高いモチベーションはありません。「知らない言語でFizzBuzzでも書いてみようか」という感じです。

がんばって本家に提案とかしたほうがいいんでしょうかね。http://github.com/php/php-src をforkすればいいの？

追記（2009-10-05）

UTF-8の分も書いてみました。

--- html.c.new.org    2009-10-05 12:55:20.000000000 +0900
+++ html.c.new.utf8    2009-10-05 21:59:52.000000000 +0900
@@ -519,95 +519,183 @@
         case cs_utf_8:
             {
                 unsigned long utf = 0;
-                int stat = 0;
-                int more = 1;
 
-                /* unpack utf-8 encoding into a wide char.
-                 * Code stolen from the mbstring extension */
+                if (this_char >= 0xc2 && this_char <= 0xdf) {
+                    utf = (this_char & 0x1f) << 6;
+                    CHECK_LEN(pos, 1);
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= (this_char & 0x3f);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
 
-                do {
-                    if (this_char < 0x80) {
-                        more = 0;
-                        if(stat) {
-                            /* we didn't finish the UTF sequence correctly */
-                            --pos;
-                            *status = FAILURE;
-                        }
-                        break;
-                    } else if (this_char < 0xc0) {
-                        switch (stat) {
-                            case 0x10:    /* 2, 2nd */
-                            case 0x21:    /* 3, 3rd */
-                            case 0x32:    /* 4, 4th */
-                            case 0x43:    /* 5, 5th */
-                            case 0x54:    /* 6, 6th */
-                                /* last byte in sequence */
-                                more = 0;
-                                utf |= (this_char & 0x3f);
-                                this_char = (unsigned short)utf;
-                                break;
-                            case 0x20:    /* 3, 2nd */
-                            case 0x31:    /* 4, 3rd */
-                            case 0x42:    /* 5, 4th */
-                            case 0x53:    /* 6, 5th */
-                                /* penultimate char */
-                                utf |= ((this_char & 0x3f) << 6);
-                                stat++;
-                                break;
-                            case 0x30:    /* 4, 2nd */
-                            case 0x41:    /* 5, 3rd */
-                            case 0x52:    /* 6, 4th */
-                                utf |= ((this_char & 0x3f) << 12);
-                                stat++;
-                                break;
-                            case 0x40:    /* 5, 2nd */
-                            case 0x51:
-                                utf |= ((this_char & 0x3f) << 18);
-                                stat++;
-                                break;
-                            case 0x50:    /* 6, 2nd */
-                                utf |= ((this_char & 0x3f) << 24);
-                                stat++;
-                                break;
-                            default:
-                                /* invalid */
-                                *status = FAILURE;
-                                more = 0;
-                        }
-                    }
-                    /* lead byte */
-                    else if (this_char < 0xe0) {
-                        stat = 0x10;    /* 2 byte */
-                        utf = (this_char & 0x1f) << 6;
-                        CHECK_LEN(pos, 1);
-                    } else if (this_char < 0xf0) {
-                        stat = 0x20;    /* 3 byte */
-                        utf = (this_char & 0xf) << 12;
-                        CHECK_LEN(pos, 2);
-                    } else if (this_char < 0xf8) {
-                        stat = 0x30;    /* 4 byte */
-                        utf = (this_char & 0x7) << 18;
-                        CHECK_LEN(pos, 3);
-                    } else if (this_char < 0xfc) {
-                        stat = 0x40;    /* 5 byte */
-                        utf = (this_char & 0x3) << 24;
-                        CHECK_LEN(pos, 4);
-                    } else if (this_char < 0xfe) {
-                        stat = 0x50;    /* 6 byte */
-                        utf = (this_char & 0x1) << 30;
-                        CHECK_LEN(pos, 5);
-                    } else {
-                        /* invalid; bail */
-                        more = 0;
-                        *status = FAILURE;
-                        break;
-                    }
+                } else if (this_char == 0xe0) {
+                    utf = (this_char & 0xf) << 12;
+                    CHECK_LEN(pos, 2);
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0xa0 && this_char <= 0xbf) {
+                        utf |= ((this_char & 0x3f) << 6);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
 
-                    if (more) {
-                        this_char = str[pos++];
-                        MB_WRITE((unsigned char)this_char);
-                    }
-                } while (more);
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= (this_char & 0x3f);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                } else if ((this_char >= 0xe1 && this_char <= 0xec) ||
+                           (this_char >= 0xee && this_char <= 0xef)) {
+                    utf = (this_char & 0xf) << 12;
+                    CHECK_LEN(pos, 2);
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= ((this_char & 0x3f) << 6);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= (this_char & 0x3f);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                } else if (this_char == 0xed) {
+                    utf = (this_char & 0xf) << 12;
+                    CHECK_LEN(pos, 2);
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0xa0 && this_char <= 0x9f) {
+                        utf |= ((this_char & 0x3f) << 6);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= (this_char & 0x3f);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                } else if (this_char == 0xf0) {
+                    utf = (this_char & 0x7) << 18;
+                    CHECK_LEN(pos, 3);
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x90 && this_char <= 0xbf) {
+                        utf |= ((this_char & 0x3f) << 12);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= ((this_char & 0x3f) << 6);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= (this_char & 0x3f);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                } else if (this_char >= 0xf1 && this_char <= 0xf3) {
+                    utf = (this_char & 0x7) << 18;
+                    CHECK_LEN(pos, 3);
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= ((this_char & 0x3f) << 12);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= ((this_char & 0x3f) << 6);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= (this_char & 0x3f);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                } else if (this_char == 0xf4) {
+                    utf = (this_char & 0x7) << 18;
+                    CHECK_LEN(pos, 3);
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0x8f) {
+                        utf |= ((this_char & 0x3f) << 12);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= ((this_char & 0x3f) << 6);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                    this_char = str[pos++];
+                    MB_WRITE((unsigned char)this_char);
+                    if (this_char >= 0x80 && this_char <= 0xbf) {
+                        utf |= (this_char & 0x3f);
+                        this_char = (unsigned short)utf;
+                    } else {
+                        *status = FAILURE;
+                    }
+
+                } else if (!(this_char >= 0x00 && this_char <= 0x7f)) {
+                    *status = FAILURE;
+                }
             }
             break;
         case cs_big5:

id:t_komuraさんが「mb_check_encoding() の代替関数」で書かれた正規表現に従っているつもりです。if文の羅列で見苦しいですが、どうしたらいいのか分からず、そのままにしました。普通はマクロを使うんでしょうか。

前掲のテストを走らせると、冗長表現のテストケースも期待どおり空になりました。デグレがあったらすみません。

追記その2（2009-10-06）

当初、単独のxf0〜xfcがShift_JISでスルーされることのみを問題視していたのですが、不正な単独バイトをスルーするのが不適切な仕様だと気づいたので、記事の表現やテストケースを一部改めました。

今後ですが、今回得られた知見にもとづいて、PHPのバグレポートを提出するつもりです。マージされるよう願っています。

追記その3（2009-10-06）

そんなわけで、バグレポートを提出しました。

PHP :: Bug #49785 :: htmlspecialchars() should check byte sequence more strictly

バグじゃないといわれる可能性もあります。そうなったら、どなたか英語の得意な方、援軍してください＞＜