瀏覽代碼

SessionSerializer: Truly unique URIs

Before, only spent uris were sanitized so that they were not also contained
within remaining uris. Change this so that each uri in the
union(remaining,spent) gets saved at most once.
The order of the uris won't be changed, with remaining uris going
first, followed by spent uris.

Also avoid copying the uri std::strings around during dupe checking,
usually resulting in better performance regarding CPU and space.
Nils Maier 12 年之前
父節點
當前提交
3b32dcb9f1
共有 2 個文件被更改,包括 57 次插入31 次删除
  1. 45 30
      src/SessionSerializer.cc
  2. 12 1
      test/SessionSerializerTest.cc

+ 45 - 30
src/SessionSerializer.cc

@@ -137,24 +137,41 @@ bool writeOption(IOFile& fp, const SharedHandle<Option>& op)
 } // namespace
 
 namespace {
-bool writeUri(IOFile& fp, const std::string& uri)
-{
-  return fp.write(uri.c_str(), uri.size()) == uri.size() &&
-    fp.write("\t", 1) == 1;
-}
-} // namespace
+  template<typename T>
+  class Unique {
+    typedef T type;
+    struct PointerCmp {
+      inline bool operator()(const type* x, const type* y) {
+        return *x < *y;
+      }
+    };
+    std::set<const type*, PointerCmp> known;
+  public:
+    inline bool operator()(const type& v) {
+      return known.insert(&v).second;
+    }
+  };
 
-namespace {
-template<typename InputIterator>
-bool writeUri(IOFile& fp, InputIterator first, InputIterator last)
-{
-  for(; first != last; ++first) {
-    if(!writeUri(fp, *first)) {
-      return false;
+  bool writeUri(IOFile& fp, const std::string& uri)
+  {
+    return fp.write(uri.c_str(), uri.size()) == uri.size() &&
+      fp.write("\t", 1) == 1;
+  }
+
+  template<typename InputIterator, class UnaryPredicate>
+  bool writeUri(IOFile& fp, InputIterator first, InputIterator last,
+                UnaryPredicate& filter)
+  {
+    for(; first != last; ++first) {
+      if (!filter(*first)) {
+        continue;
+      }
+      if(!writeUri(fp, *first)) {
+        return false;
+      }
     }
+    return true;
   }
-  return true;
-}
 } // namespace
 
 // The downloads whose followedBy() is empty is persisited with its
@@ -196,29 +213,27 @@ bool writeDownloadResult
     }
     const SharedHandle<FileEntry>& file = dr->fileEntries[0];
     // Don't save download if there are no URIs.
-    if(file->getRemainingUris().empty() &&
-       file->getSpentUris().empty()) {
+    const bool hasRemaining = !file->getRemainingUris().empty();
+    const bool hasSpent = !file->getSpentUris().empty();
+    if (!hasRemaining && !hasSpent) {
       return true;
     }
+
     // Save spent URIs + remaining URIs. Remove URI in spent URI which
     // also exists in remaining URIs.
-    std::set<std::string> uriSet(file->getRemainingUris().begin(),
-                                 file->getRemainingUris().end());
-    for(std::deque<std::string>::const_iterator i =
-          file->getSpentUris().begin(), eoi = file->getSpentUris().end();
-        i != eoi; ++i) {
-      if(uriSet.count(*i)) {
-        continue;
+    {
+      Unique<std::string> unique;
+      if (hasRemaining && !writeUri(fp, file->getRemainingUris().begin(),
+                                    file->getRemainingUris().end(),
+                                    unique)) {
+        return false;
       }
-      uriSet.insert(*i);
-      if(!writeUri(fp, *i)) {
+      if (hasSpent && !writeUri(fp, file->getSpentUris().begin(),
+                                file->getSpentUris().end(),
+                                unique)) {
         return false;
       }
     }
-    if(!writeUri(fp, file->getRemainingUris().begin(),
-                 file->getRemainingUris().end())) {
-      return false;
-    }
     if(fp.write("\n", 1) != 1) {
       return false;
     }

+ 12 - 1
test/SessionSerializerTest.cc

@@ -61,8 +61,19 @@ void SessionSerializerTest::testSave()
   };
   // This URI will be discarded because same URI exists in remaining
   // URIs.
+  drs[1]->fileEntries[0]->getRemainingUris().push_back("http://error");
+  drs[1]->fileEntries[0]->getRemainingUris().push_back("http://error3");
+  // This URI will be discarded because same URI exists in remaining
+  // URIs.
+  drs[1]->fileEntries[0]->getRemainingUris().push_back("http://error");
+  //
+  // This URI will be discarded because same URI exists in remaining
+  // URIs.
   drs[1]->fileEntries[0]->getSpentUris().push_back("http://error");
   drs[1]->fileEntries[0]->getSpentUris().push_back("http://error2");
+  // This URI will be discarded because same URI exists in remaining
+  // URIs.
+  drs[1]->fileEntries[0]->getSpentUris().push_back("http://error");
 
   drs[3]->option->put(PREF_FORCE_SAVE, A2_V_TRUE);
   for(size_t i = 0; i < sizeof(drs)/sizeof(drs[0]); ++i) {
@@ -79,7 +90,7 @@ void SessionSerializerTest::testSave()
   std::ifstream ss(filename.c_str(), std::ios::binary);
   std::string line;
   std::getline(ss, line);
-  CPPUNIT_ASSERT_EQUAL(std::string("http://error2\thttp://error\t"), line);
+  CPPUNIT_ASSERT_EQUAL(std::string("http://error\thttp://error3\thttp://error2\t"), line);
   std::getline(ss, line);
   CPPUNIT_ASSERT_EQUAL(fmt(" gid=%s", drs[1]->gid->toHex().c_str()), line);
   std::getline(ss, line);