Browse Source

2010-10-02 Tatsuhiro Tsujikawa <t-tujikawa@users.sourceforge.net>

	Non-UTF8 filenames are now percent-encoded.  For example, the filename
	for http://example.org/%90%A2%8AE will be %90%A2%8AE because it is
	Shift_JIS, not UTF-8. The comment and name of a .torrent file in an
	XML-RPC response are percent-encoded if they are not UTF-8.
	* src/FtpNegotiationCommand.cc
	* src/HttpRequestCommand.cc
	* src/HttpResponseCommand.cc
	* src/XmlRpcMethodImpl.cc
	* src/bittorrent_helper.cc
	* src/util.cc
	* src/util.h
	* test/BittorrentHelperTest.cc
	* test/UtilTest.cc
Tatsuhiro Tsujikawa 15 years ago
parent
commit
d956ea0b70

+ 16 - 0
ChangeLog

@@ -1,3 +1,19 @@
+2010-10-02  Tatsuhiro Tsujikawa  <t-tujikawa@users.sourceforge.net>
+
+	Non-UTF8 filenames are now percent-encoded.  For example, filename
+	for http://example.org/%90%A2%8AE will be %90%A2%8AE because it is
+	Shift_JIS. The comments and name in .torrent file in XML-RPC
+	response are percent-encoded if they are not UTF-8.
+	* src/FtpNegotiationCommand.cc
+	* src/HttpRequestCommand.cc
+	* src/HttpResponseCommand.cc
+	* src/XmlRpcMethodImpl.cc
+	* src/bittorrent_helper.cc
+	* src/util.cc
+	* src/util.h
+	* test/BittorrentHelperTest.cc
+	* test/UtilTest.cc
+
 2010-09-26  Tatsuhiro Tsujikawa  <t-tujikawa@users.sourceforge.net>
 
 	Renamed TripletGet as TupleGet. Renamed TripletNthType as

+ 2 - 3
src/FtpNegotiationCommand.cc

@@ -377,10 +377,9 @@ bool FtpNegotiationCommand::onFileSizeDetermined(uint64_t totalLength)
   getFileEntry()->setLength(totalLength);
   if(getFileEntry()->getPath().empty()) {
     getFileEntry()->setPath
-      (util::applyDir
+      (util::createSafePath
        (getDownloadContext()->getDir(),
-        util::fixTaintedBasename
-        (util::percentDecode(getRequest()->getFile()))));
+        util::percentDecode(getRequest()->getFile())));
   }
   getRequestGroup()->preDownloadProcessing();
   if(getDownloadEngine()->getRequestGroupMan()->

+ 2 - 2
src/HttpRequestCommand.cc

@@ -159,9 +159,9 @@ bool HttpRequestCommand::executeInternal() {
         } else {
           if(getFileEntry()->getPath().empty()) {
             getFileEntry()->setPath
-              (util::applyDir
+              (util::createSafePath
                (getDownloadContext()->getDir(),
-                util::fixTaintedBasename(getRequest()->getFile())));
+                util::percentDecode(getRequest()->getFile())));
           }
           File ctrlfile(getFileEntry()->getPath()+
                         DefaultBtProgressInfoFile::getSuffix());

+ 2 - 3
src/HttpResponseCommand.cc

@@ -168,9 +168,8 @@ bool HttpResponseCommand::executeInternal()
     getFileEntry()->setLength(totalLength);
     if(getFileEntry()->getPath().empty()) {
       getFileEntry()->setPath
-        (util::applyDir
-         (getDownloadContext()->getDir(),
-          util::fixTaintedBasename(httpResponse->determinFilename())));
+        (util::createSafePath
+         (getDownloadContext()->getDir(), httpResponse->determinFilename()));
     }
     getFileEntry()->setContentType(httpResponse->getContentType());
     getRequestGroup()->preDownloadProcessing();

+ 2 - 2
src/XmlRpcMethodImpl.cc

@@ -597,7 +597,7 @@ void gatherBitTorrentMetadata
  const SharedHandle<TorrentAttribute>& torrentAttrs)
 {
   if(!torrentAttrs->comment.empty()) {
-    btDict->put(KEY_COMMENT, torrentAttrs->comment);
+    btDict->put(KEY_COMMENT, util::encodeNonUtf8(torrentAttrs->comment));
   }
   if(torrentAttrs->creationDate) {
     btDict->put(KEY_CREATION_DATE, Integer::g(torrentAttrs->creationDate));
@@ -619,7 +619,7 @@ void gatherBitTorrentMetadata
   btDict->put(KEY_ANNOUNCE_LIST, destAnnounceList);
   if(!torrentAttrs->metadata.empty()) {
     SharedHandle<Dict> infoDict = Dict::g();
-    infoDict->put(KEY_NAME, torrentAttrs->name);
+    infoDict->put(KEY_NAME, util::encodeNonUtf8(torrentAttrs->name));
     btDict->put(KEY_INFO, infoDict);
   }
 }

+ 14 - 13
src/bittorrent_helper.cc

@@ -187,6 +187,7 @@ static void extractFileEntries
  const std::vector<std::string>& urlList)
 {
   std::string name;
+  std::string utf8Name;
   if(overrideName.empty()) {
     std::string nameKey;
     if(infoDict->containsKey(C_NAME_UTF8)) {
@@ -196,17 +197,18 @@ static void extractFileEntries
     }
     const String* nameData = asString(infoDict->get(nameKey));
     if(nameData) {
-      if(util::detectDirTraversal(nameData->s())) {
+      utf8Name = util::encodeNonUtf8(nameData->s());
+      if(util::detectDirTraversal(utf8Name)) {
         throw DL_ABORT_EX
           (StringFormat
            (MSG_DIR_TRAVERSAL_DETECTED,nameData->s().c_str()).str());
       }
       name = nameData->s();
     } else {
-      name = strconcat(File(defaultName).getBasename(), ".file");
+      name = utf8Name = strconcat(File(defaultName).getBasename(), ".file");
     }
   } else {
-    name = overrideName;
+    name = utf8Name = overrideName;
   }
   torrent->name = name;
   std::vector<SharedHandle<FileEntry> > fileEntries;
@@ -255,9 +257,11 @@ static void extractFileEntries
         }
       }
       std::string path = strjoin(pathelem.begin(), pathelem.end(), '/');
-      if(util::detectDirTraversal(path)) {
+      std::string utf8Path = strjoin(pathelem.begin(), pathelem.end(), '/',
+                                     std::ptr_fun(util::encodeNonUtf8));
+      if(util::detectDirTraversal(utf8Path)) {
         throw DL_ABORT_EX
-          (StringFormat(MSG_DIR_TRAVERSAL_DETECTED, path.c_str()).str());
+          (StringFormat(MSG_DIR_TRAVERSAL_DETECTED, utf8Path.c_str()).str());
       }
       std::string pePath =
         strjoin(pathelem.begin(), pathelem.end(), '/',
@@ -266,9 +270,8 @@ static void extractFileEntries
       std::vector<std::string> uris;
       createUri(urlList.begin(), urlList.end(),std::back_inserter(uris),pePath);
       SharedHandle<FileEntry> fileEntry
-        (new FileEntry(util::applyDir(ctx->getDir(), util::escapePath(path)),
-                       fileLengthData->i(),
-                       offset, uris));
+        (new FileEntry(util::applyDir(ctx->getDir(),util::escapePath(utf8Path)),
+                       fileLengthData->i(), offset, uris));
       fileEntry->setOriginalName(path);
       fileEntries.push_back(fileEntry);
       offset += fileEntry->getLength();
@@ -294,17 +297,15 @@ static void extractFileEntries
         uris.push_back(*i);
       }
     }
-
     SharedHandle<FileEntry> fileEntry
-      (new FileEntry(util::applyDir(ctx->getDir(), util::escapePath(name)),
-                     totalLength, 0,
-                     uris));
+      (new FileEntry(util::applyDir(ctx->getDir(), util::escapePath(utf8Name)),
+                     totalLength, 0, uris));
     fileEntry->setOriginalName(name);
     fileEntries.push_back(fileEntry);
   }
   ctx->setFileEntries(fileEntries.begin(), fileEntries.end());
   if(torrent->mode == MULTI) {
-    ctx->setBasePath(util::applyDir(ctx->getDir(), name));
+    ctx->setBasePath(util::applyDir(ctx->getDir(), utf8Name));
   }
 }
 

+ 89 - 0
src/util.cc

@@ -291,6 +291,80 @@ bool inRFC2616HttpToken(const char c)
     std::find(vbegin(chars), vend(chars), c) != vend(chars);
 }
 
+namespace {
+bool in(unsigned char ch, unsigned char s, unsigned char t)
+{
+  return s <= ch && ch <= t;
+}
+}
+
+namespace {
+bool isUtf8Tail(unsigned char ch)
+{
+  return in(ch, 0x80, 0xbf);
+}
+}
+
+bool isUtf8(const std::string& str)
+{
+  for(std::string::const_iterator s = str.begin(), eos = str.end(); s != eos;
+      ++s) {
+    unsigned char firstChar = *s;
+    // See ABNF in http://tools.ietf.org/search/rfc3629#section-4
+    if(in(firstChar, 0x20, 0x7e) ||
+       firstChar == 0x09 || firstChar == 0x0a ||firstChar == 0x0d) {
+      // UTF8-1 (without ctrl chars)
+    } else if(in(firstChar, 0xc2, 0xdf)) {
+       // UTF8-2
+      if(++s == eos || !isUtf8Tail(*s)) {
+        return false;
+      }
+    } else if(0xe0 == firstChar) {
+      // UTF8-3
+      if(++s == eos || !in(*s, 0xa0, 0xbf) ||
+         ++s == eos || !isUtf8Tail(*s)) {
+        return false;
+      }
+    } else if(in(firstChar, 0xe1, 0xec) || in(firstChar, 0xee, 0xef)) {
+      // UTF8-3
+      if(++s == eos || !isUtf8Tail(*s) ||
+         ++s == eos || !isUtf8Tail(*s)) {
+        return false;
+      }
+    } else if(0xed == firstChar) {
+      // UTF8-3
+      if(++s == eos || !in(*s, 0x80, 0x9f) ||
+         ++s == eos || !isUtf8Tail(*s)) {
+        return false;
+      } 
+    } else if(0xf0 == firstChar) {
+      // UTF8-4
+      if(++s == eos || !in(*s, 0x90, 0xbf) ||
+         ++s == eos || !isUtf8Tail(*s) ||
+         ++s == eos || !isUtf8Tail(*s)) {
+        return false;
+      }
+    } else if(in(firstChar, 0xf1, 0xf3)) {
+      // UTF8-4
+      if(++s == eos || !isUtf8Tail(*s) ||
+         ++s == eos || !isUtf8Tail(*s) ||
+         ++s == eos || !isUtf8Tail(*s)) {
+        return false;
+      }
+    } else if(0xf4 == firstChar) {
+      // UTF8-4
+      if(++s == eos || !in(*s, 0x80, 0x8f) ||
+         ++s == eos || !isUtf8Tail(*s) ||
+         ++s == eos || !isUtf8Tail(*s)) {
+        return false;
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
 std::string percentEncode(const unsigned char* target, size_t len) {
   std::string dest;
   for(size_t i = 0; i < len; ++i) {
@@ -1406,6 +1480,21 @@ void executeHookByOptName
   }
 }
 
+std::string createSafePath
+(const std::string& dir, const std::string& filename)
+{
+  return util::applyDir
+    (dir,
+     util::isUtf8(filename)?
+     util::fixTaintedBasename(filename):
+     util::escapePath(util::percentEncode(filename)));
+}
+
+std::string encodeNonUtf8(const std::string& s)
+{
+  return util::isUtf8(s)?s:util::percentEncode(s);
+}
+
 } // namespace util
 
 } // namespace aria2

+ 6 - 0
src/util.h

@@ -148,6 +148,8 @@ bool inRFC3986ReservedChars(const char c);
 
 bool inRFC3986UnreservedChars(const char c);
 
+bool isUtf8(const std::string& str);
+
 std::string percentDecode(const std::string& target);
 
 std::string torrentPercentEncode(const unsigned char* target, size_t len);
@@ -405,6 +407,10 @@ void executeHookByOptName
 void executeHookByOptName
 (const RequestGroup* group, const Option* option, const std::string& opt);
 
+std::string createSafePath(const std::string& dir, const std::string& filename);
+
+std::string encodeNonUtf8(const std::string& s);
+
 } // namespace util
 
 } // namespace aria2

+ 48 - 0
test/BittorrentHelperTest.cc

@@ -52,6 +52,8 @@ class BittorrentHelperTest:public CppUnit::TestFixture {
   CPPUNIT_TEST(testLoadFromMemory_overrideName);
   CPPUNIT_TEST(testLoadFromMemory_multiFileDirTraversal);
   CPPUNIT_TEST(testLoadFromMemory_singleFileDirTraversal);
+  CPPUNIT_TEST(testLoadFromMemory_multiFileNonUtf8Path);
+  CPPUNIT_TEST(testLoadFromMemory_singleFileNonUtf8Path);
   CPPUNIT_TEST(testGetNodes);
   CPPUNIT_TEST(testGetBasePath);
   CPPUNIT_TEST(testSetFileFilter_single);
@@ -102,6 +104,8 @@ public:
   void testLoadFromMemory_overrideName();
   void testLoadFromMemory_multiFileDirTraversal();
   void testLoadFromMemory_singleFileDirTraversal();
+  void testLoadFromMemory_multiFileNonUtf8Path();
+  void testLoadFromMemory_singleFileNonUtf8Path();
   void testGetNodes();
   void testGetBasePath();
   void testSetFileFilter_single();
@@ -400,6 +404,50 @@ void BittorrentHelperTest::testGetFileEntries_singleFileUrlListEndsWithSlash() {
                        uris1[0]);
 }
 
+void BittorrentHelperTest::testLoadFromMemory_multiFileNonUtf8Path()
+{
+  SharedHandle<List> path = List::g();
+  path->append("path");
+  path->append(util::fromHex("90a28a")+"E");
+  SharedHandle<Dict> file = Dict::g();
+  file->put("length", Integer::g(1024));
+  file->put("path", path);
+  SharedHandle<List> files = List::g();
+  files->append(file);
+  SharedHandle<Dict> info = Dict::g();
+  info->put("files", files);
+  info->put("piece length", Integer::g(1024));
+  info->put("pieces", "01234567890123456789");
+  info->put("name", util::fromHex("1b")+"$B%O%m!<"+util::fromHex("1b")+"(B");
+  Dict dict;
+  dict.put("info", info);
+  SharedHandle<DownloadContext> dctx(new DownloadContext());
+  loadFromMemory(bencode2::encode(&dict), dctx, "default");
+
+  const SharedHandle<FileEntry>& fe = dctx->getFirstFileEntry();
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("./%1B%24B%25O%25m%21%3C%1B%28B/path/%90%A2%8AE"),
+     fe->getPath());
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("./%1B%24B%25O%25m%21%3C%1B%28B"), dctx->getBasePath());
+}
+
+void BittorrentHelperTest::testLoadFromMemory_singleFileNonUtf8Path()
+{
+  SharedHandle<Dict> info = Dict::g();
+  info->put("piece length", Integer::g(1024));
+  info->put("pieces", "01234567890123456789");
+  info->put("name", util::fromHex("90a28a")+"E");
+  info->put("length", Integer::g(1024));
+  Dict dict;
+  dict.put("info", info);
+  SharedHandle<DownloadContext> dctx(new DownloadContext());
+  loadFromMemory(bencode2::encode(&dict), dctx, "default");
+
+  const SharedHandle<FileEntry>& fe = dctx->getFirstFileEntry();
+  CPPUNIT_ASSERT_EQUAL(std::string("./%90%A2%8AE"), fe->getPath());
+}
+
 void BittorrentHelperTest::testLoadFromMemory()
 {
   std::string memory = "d8:announce36:http://aria.rednoah.com/announce.php13:announce-listll16:http://tracker1 el15:http://tracker2el15:http://tracker3ee7:comment17:REDNOAH.COM RULES13:creation datei1123456789e4:infod5:filesld6:lengthi284e4:pathl5:aria23:src6:aria2ceed6:lengthi100e4:pathl19:aria2-0.2.2.tar.bz2eee4:name10:aria2-test12:piece lengthi128e6:pieces60:AAAAAAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCCCee";

+ 34 - 0
test/UtilTest.cc

@@ -65,6 +65,7 @@ class UtilTest:public CppUnit::TestFixture {
   CPPUNIT_TEST(testEscapePath);
   CPPUNIT_TEST(testGetCidrPrefix);
   CPPUNIT_TEST(testInSameCidrBlock);
+  CPPUNIT_TEST(testIsUtf8String);
   CPPUNIT_TEST_SUITE_END();
 private:
 
@@ -118,6 +119,7 @@ public:
   void testEscapePath();
   void testGetCidrPrefix();
   void testInSameCidrBlock();
+  void testIsUtf8String();
 };
 
 
@@ -1098,4 +1100,36 @@ void UtilTest::testInSameCidrBlock()
   CPPUNIT_ASSERT(!util::inSameCidrBlock("192.168.128.1", "192.168.0.1", 17));
 }
 
+void UtilTest::testIsUtf8String()
+{
+  CPPUNIT_ASSERT(util::isUtf8("ascii"));
+  // "Hello World" in Japanese UTF-8
+  CPPUNIT_ASSERT(util::isUtf8
+                 (util::fromHex("e38193e38293e381abe381a1e381afe4b896e7958c")));
+  // "World" in Shift_JIS
+  CPPUNIT_ASSERT(!util::isUtf8(util::fromHex("90a28a")+"E"));
+  // UTF8-2
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("c280")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("dfbf")));
+  // UTF8-3
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("e0a080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("e0bf80")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("e18080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("ec8080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("ed8080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("ed9f80")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("ee8080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("ef8080")));
+  // UTF8-4
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("f0908080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("f0bf8080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("f1808080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("f3808080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("f4808080")));
+  CPPUNIT_ASSERT(util::isUtf8(util::fromHex("f48f8080")));
+
+  CPPUNIT_ASSERT(util::isUtf8(""));
+  CPPUNIT_ASSERT(!util::isUtf8(util::fromHex("00")));
+}
+
 } // namespace aria2