<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:Wingdings;
panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
{font-family:"Cambria Math";
panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0cm;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman",serif;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p
{mso-style-priority:99;
margin:0cm;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman",serif;}
p.msonormal0, li.msonormal0, div.msonormal0
{mso-style-name:msonormal;
margin:0cm;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman",serif;}
span.EmailStyle19
{mso-style-type:personal-reply;
font-family:"Calibri",sans-serif;
color:#1F497D;}
.MsoChpDefault
{mso-style-type:export-only;
font-size:10.0pt;}
@page WordSection1
{size:612.0pt 792.0pt;
margin:3.0cm 2.0cm 3.0cm 2.0cm;}
div.WordSection1
{page:WordSection1;}
/* List Definitions */
@list l0
{mso-list-id:514612318;
mso-list-template-ids:-1267984690;}
@list l0:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7;
mso-level-tab-stop:36.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l0:level2
{mso-level-number-format:bullet;
mso-level-text:o;
mso-level-tab-stop:72.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:"Courier New";
mso-bidi-font-family:"Times New Roman";}
@list l0:level3
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:108.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l0:level4
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:144.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l0:level5
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:180.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l0:level6
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:216.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l0:level7
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:252.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l0:level8
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:288.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l0:level9
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:324.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l1
{mso-list-id:933517568;
mso-list-template-ids:728898612;}
@list l1:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7;
mso-level-tab-stop:36.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l1:level2
{mso-level-number-format:bullet;
mso-level-text:o;
mso-level-tab-stop:72.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:"Courier New";
mso-bidi-font-family:"Times New Roman";}
@list l1:level3
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:108.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l1:level4
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:144.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l1:level5
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:180.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l1:level6
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:216.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l1:level7
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:252.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l1:level8
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:288.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l1:level9
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:324.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l2
{mso-list-id:1018508388;
mso-list-template-ids:650264630;}
@list l2:level1
{mso-level-number-format:bullet;
mso-level-text:\F0B7;
mso-level-tab-stop:36.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Symbol;}
@list l2:level2
{mso-level-number-format:bullet;
mso-level-text:o;
mso-level-tab-stop:72.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:"Courier New";
mso-bidi-font-family:"Times New Roman";}
@list l2:level3
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:108.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l2:level4
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:144.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l2:level5
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:180.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l2:level6
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:216.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l2:level7
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:252.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l2:level8
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:288.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
@list l2:level9
{mso-level-number-format:bullet;
mso-level-text:\F0A7;
mso-level-tab-stop:324.0pt;
mso-level-number-position:left;
text-indent:-18.0pt;
mso-ansi-font-size:10.0pt;
font-family:Wingdings;}
ol
{margin-bottom:0cm;}
ul
{margin-bottom:0cm;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="DA" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">See my
<span style="background:yellow;mso-highlight:yellow">TLR</span> comments below<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p class="MsoNormal"><b><span style="font-size:11.0pt;font-family:'Calibri',sans-serif">From:</span></b><span style="font-size:11.0pt;font-family:'Calibri',sans-serif"> NetarchiveSuite-users &lt;netarchivesuite-users-bounces@ml.sbforge.org&gt;
<b>On Behalf Of </b>Peter Svanberg<br>
<b>Sent:</b> Thursday, December 12, 2019 11:40 PM<br>
<b>To:</b> netarchivesuite-users@ml.sbforge.org<br>
<b>Subject:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?; Deduplication<o:p></o:p></span></p>
</div>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
<div id="divtagdefaultwrapper">
<div>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Hello!</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">I return to this subject. Have I understood right?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p> </o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-family:"Calibri",sans-serif;color:black">Deduplication requires:</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<ul type="disc">
<li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l0 level1 lfo1">
<span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Settings for harvester.indexserver.* and harvester.harvesting.deduplication.enabled = true in settings for HarvestJobManagerApplication on the admin host.</span><span style="font-family:"Calibri",sans-serif"><o:p></o:p></span></li><li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l0 level1 lfo1">
<span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D">&lt;ref bean…&gt; and &lt;bean id=”DeDuplicator …&gt; elements in the template, containing %{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}:</span><span style="font-family:'Calibri',sans-serif"><o:p></o:p></span></li></ul>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:"Calibri",sans-serif;color:#1F497D"><o:p> </o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&lt;bean id="DeDuplicator" class="is.hi.bok.deduplicator.DeDuplicator"&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="indexLocation" value="%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"/&gt; <o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="matchingMethod" value="URL"/&gt; <o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="tryEquivalent" value="TRUE"/&gt; <o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="changeContentSize" value="false"/&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="mimeFilter" value="^text/.*"/&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="filterMode" value="BLACKLIST"/&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&lt;!-- &lt;property name="analyze-modes" value="TIMESTAMP"/&gt; Does not work? --&gt; <o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">TLR: We are not using “analyze-modes”</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="origin" value=""/&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="originHandling" value="INDEX"/&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="statsPerHost" value="true"/&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="revisitInWarcs" value="true"/&gt;<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">TLR: We have not set the
</span><span lang="EN-GB" style="font-family:"Calibri",sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">revisitInWarcs here in our templates – but we have revisits in our warc files – so perhaps it is set in the source code</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">”<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&nbsp;&nbsp;&lt;property name="enabled" value="%{DEDUPLICATION_ENABLED_PLACEHOLDER}" /&gt;<o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D">&lt;/bean&gt; <o:p></o:p></span></p>
</div>
<div>
<p class="MsoNormal"><span style="font-family:"Calibri",sans-serif;color:black"><o:p> </o:p></span></p>
</div>
<p style="margin-left:36.0pt"><b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">NOTE</span></b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">: The source code and the wiki
documentation shows </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">dash-delimited names but it seems it should be these camel case type names. Strange! Is the source code (for DeDuplicator.java) on Github older than the
binaries?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">TLR: The developer team (Colin?) has to answer you</span><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D">
<o:p></o:p></span></p>
<p style="margin-left:36.0pt"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p> </o:p></span></p>
<ul type="disc">
<li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l1 level1 lfo2">
<span lang="EN-GB" style="font-family:'Calibri',sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D">If you haven’t run with deduplication enabled in previous harvests, the Lucene indexes are generated
at the start of the first snapshot harvest where it is enabled. True?</span><span style="font-family:'Calibri',sans-serif"><o:p></o:p></span></li></ul>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">TLR: Yes – and there must also be a previous harvest where the jobstatus for the different jobs are different from NEW, SUBMITTED,
RESUBMITTED and some other I don’t remember. It is ok if they are DONE or FAILED.</span><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">And a question:</span><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<ul type="disc">
<li class="MsoNormal" style="color:black;mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;mso-list:l2 level1 lfo3">
<span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">You said you move the index files manually. Do you let the files be distributed to the first harvest host and then copy a whole directory to the other hosts? Or what?</span><span style="font-family:"Calibri",sans-serif"><o:p></o:p></span></li></ul>
<p class="MsoNormal" style="mso-margin-top-alt:auto;mso-margin-bottom-alt:auto"><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">TLR: typically I start 1 harvester instance on
one harvester server and wait until it is up and running (it does the copy of the index folder and gunzips it automatically
</span><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">&#9786;</span><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;background:yellow;mso-highlight:yellow">).
Then I start all the harvester instances in one shot on the same server, because they can reuse the index already copied to the PROD/cache dir. The same procedure on all five harvest servers - one server at a time… The manual copy of the whole Lucene folder
and relinking is only needed when the index in step 2 is totally crashed and I’m forced to reuse the generated index from step 1.</span><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Regards!<o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> <o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Peter Svanberg, Sweden<o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">Från:</span></b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users <<a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org">netarchivesuite-users-bounces@ml.sbforge.org</a>>
<b>För </b>Tue Hejlskov Larsen<br>
<b>Skickat:</b> den 26 juni 20</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">19 21:28<br>
<b>Till:</b> <a href="mailto:netarchivesuite-users@ml.sbforge.org">netarchivesuite-users@ml.sbforge.org</a><br>
<b>Ämne:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?; Deduplication<o:p></o:p></span></p>
</div>
</div>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> <o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">We are using the NAS Indexserver and there is
<b>no database</b> – only a cached filesystem with generated lucene gz indexes.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Every time a new harvest is generated a new index-server job is generated by the Indexserver to collect only the
<b>previous harvest job crawllogs</b> files and cdxfiles included in the metadata warc files for each harvest. Out of that is a lucene gz index generated in the cache directory. The next step is that the jobs in the harvest are generated by the job database
and picked up by the active e.g. broad crawl harvesters. They request the index in the Index server cache and it is copied to the actual harvester server cache and used by all harvesters on that server. So here it is very important to only have 1 broad crawl
harvester active in the beginning – otherwise it will bomb your ftp server on that server. Remember to do the same for each broad crawl server. So when the harvester has the cached index on the local harvester server it starts running and each harvested
url is compared by url and checksum against the cached lucene index. If it is already there it is removed and the duplicate is annotated in the crawllog and a revisit is made.</span><span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D">After the job is finished we have a NAS Waybackindexer and Aggregator which runs every day. It uses a local derby database to manage which files are CDX indexed. The
Waybackindexer runs a complete archive file list and compares it with the derby database and all new files are indexed using the job metadata warc file included cdx files and the crawllog duplicate annotations to generate a complete CDX for all harvested objects,
also the duplicates in the job warcfiles. We are not using the revisits and the OpenWayback cdx indexserver.</span><span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Best regards</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Tue
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">From:</span></b><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users <</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span lang="EN-US" style="color:#0563C1">netarchivesuite-users-bounces@ml.sbforge.org</span></a></span><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">>
<b>On Behalf Of </b>Peter Svanberg<br>
<b>Sent:</b> Wedne</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">sday, June 26, 2019 7:10 PM<br>
<b>To:</b> <a href="mailto:netarchivesuite-users@ml.sbforge.org"><span style="color:#0563C1">netarchivesuite-users@ml.sbforge.org</span></a><br>
<b>Subject:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?; Deduplication<o:p></o:p></span></p>
</div>
</div>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> <o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Deduplication: You use is.hi.bok.deduplicator.DeDuplicator, which (if I understand correct):</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D">·</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D">Saves visited 2xx-URL:s with their checksum values in a Lucene index database.</span><span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D">·</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Decides equality via checksum and can treat any two fetched
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><span class="MsoHyperlink"><span lang="EN-GB" style="color:#0563C1">URL:s</span></span></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">
as equal.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D">·</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">The index database can contain data from several earlier crawls, not only the current.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">You limit the use (perhaps via default parameter values) to</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D">·</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Just consider same URLs, or “equivalent” URL (same domain and path but different www[0-9] hostname prefix)
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D">·</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Don’t apply to mime-type text/*.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Do you use this on a broad crawl? With one index database for the whole process, being updated from all harvesters?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D">Do you get 40–50 % data reduction on a broad crawl with these limits? Okay, many large
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><span class="MsoHyperlink"><span lang="EN-GB" style="color:#0563C1">URL:s</span></span></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">
have links from many other </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><span class="MsoHyperlink"><span lang="EN-GB" style="color:#0563C1">URL:s</span></span></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">,
I suppose.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D">Have you considered not using those limits? Are there performance losses in having too many revisit records?</span><span style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">I suppose there is an equivalent module for OpenWayback which use the same index database to find the warc file for the record pointed out by a revisit record? And
also for pyweb?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Regards!</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Peter Svanberg</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">Från:</span></b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users &lt;</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span lang="EN-GB" style="color:#0563C1">netarchivesuite-users-bounces@ml.sbforge.org</span></a></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">&gt;
<b>För </b>Tue Hejlskov Larsen<br>
<b>Skickat:</b> den 24 juni 2019 21:26<br>
<b>Till:</b> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:netarchivesuite-users@ml.sbforge.org"><span lang="EN-GB" style="color:#0563C1">netarchivesuite-users@ml.sbforge.org</span></a></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><br>
<b>Ämne:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
</div>
</div>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">See my TLR comments below</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">From:</span></b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users &lt;<a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span style="color:#0563C1">netarchivesuite-users-bounces@ml.sbforge.org</span></a>&gt;
<b>On Behalf Of </b>Peter Svanberg<br>
<b>Sent:</b> Monday, June 24, 2019 5:05 PM<br>
<b>To:</b> <a href="mailto:netarchivesuite-users@ml.sbforge.org"><span style="color:#0563C1">netarchivesuite-users@ml.sbforge.org</span></a><br>
<b>Subject:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?<o:p></o:p></span></p>
</div>
</div>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> <o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">I continue with my curiosity, I hope it’s OK.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">You mean about 70 TByte fetched in about 100–120 days? (Or was the selective “mega” included in 70?)</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">TLR>>> yes we run both broadcrawl step 1 and step 2 and selective broad crawl jobs in parallel, because we have 2 dedicated harvester farms in AAR and CPH.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">And 20 TByte is what ends up being stored in the archive?
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">TLR>>> yes</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Approximately how many URI:s does this correspond to – before and after deduplication?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">TLR >>> We are talking million/billion urls, - just to mention - we have 1.1 billion 5003 “byte limit reached” in step 2. All urls are recorded in the crawllogs also
the deduplicated are annotated there. </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">So you need to specify which type of return code urls you want numbers for
</span><span lang="EN-GB" style="font-size:11.0pt;color:#1F497D">&#9786;</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">. We do some daily statistics for some of the return codes. It is really
huge numbers!</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">TLR>>>Deduplication gives appr. 40-50 % and gz 40-50 %. </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">We have been advised to do the broad crawl in several steps with increasing max thresholds, is that what you do in 2 steps? With what thresholds different (and what
levels)?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">TLR>>> yes we have following steps:</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">1)</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Step 1: 50 MB byte limit : all domains in the jobs database : duration 1-2 weeks</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">We do some sqlextraction from the jobdatabase and increase the max bytelimit per domain for some 10- 20.000 domains before each new step 2 broad crawl.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">2)</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Step 2: 16 GB byte limit: all domains which have hit 50 MB limit: duration 1-2 months.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p style="margin-left:36.0pt;text-indent:-18.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">3)</span><span lang="EN-GB" style="font-size:7.0pt;color:#1F497D">
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">And we run about 3-4 big selective broad crawl harvests in parallel - with different (huge) domains - taken out of the step 2 broad crawl. They run 3 – 6 weeks
each and each harvest creates about 10 -20 jobs running in parallel in AAR together with the normal daily selective harvests.
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">So we are using most of our harvester capacity for long periods during the “broad crawl”.
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">And with reference to the subject line … what is your typical URI/sec and KB/sec figures in a single job?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">TLR>>> I have not looked into that ( we have between 50-90 different jobs /day) , because we are using the NAS std. setup ( you have got a copy of that earlier).
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">The main problem was earlier - domains which blocked/throttled us and next that our capacity agreements with the biggest webhotels were too low. The biggest one (
a .be company) has about 50-75 % of all .dk domains.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">After we have increased our max concurrent requests agreements with them to 40MB/sec for our harvester ip ranges in AAR and CPH and upgraded to NAS 5.5 we have no
big performance issues anymore.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Med venlig hilsen</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:black"><br>
<br>
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Arial",sans-serif;color:black">Peter Svanberg</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">Från:</span></b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users &lt;</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span lang="EN-GB" style="color:#0563C1">netarchivesuite-users-bounces@ml.sbforge.org</span></a></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">&gt;
<b>För </b>Tue Hejlskov Larsen<br>
<b>Skickat:</b> den 24 juni 2019 15:16<br>
<b>Till:</b> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:netarchivesuite-users@ml.sbforge.org"><span lang="EN-GB" style="color:#0563C1">netarchivesuite-users@ml.sbforge.org</span></a></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><br>
<b>Ämne:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
</div>
</div>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">The 70 TB is based on NAS GUI/crawllog numbers – and before deduplication and gz – about 20 TB gz uploaded.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">“A broadcrawl” runs about 2 - 2 1/2 months - and we do some job follow up during step 2 ( this part takes about 1 ½ month) and the selective broad crawl job “mega
big sites” (runs for a month or more and here we use another queue assign policy and much lower delays and harvest only domains which can take a huge number of crawling requests!) </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Best regards</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Tue</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">From:</span></b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users &lt;<a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span style="color:#0563C1">netarchivesuite-users-bounces@ml.sbforge.org</span></a>&gt;
<b>On Behalf Of </b>Peter Svanberg<br>
<b>Sent:</b> Monday, June 24, 2019 2:54 PM<br>
<b>To:</b> <a href="mailto:netarchivesuite-users@ml.sbforge.org"><span style="color:#0563C1">netarchivesuite-users@ml.sbforge.org</span></a><br>
<b>Subject:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?<o:p></o:p></span></p>
</div>
</div>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> <o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Thank you Tue, this is very interesting information!</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">About 70 TB in how many days?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">You emphasize “harvested” – do you aim at that more data is downloaded but not archived (sorted out duplicates/irrelevant?)?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">I’ll return when I have gathered corresponding info on our environment.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Regards,</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:black">-----<br>
<br>
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Arial",sans-serif;color:black">Peter Svanberg</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><br>
</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><br>
</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Arial",sans-serif;color:#1F497D">National Library of Sweden</span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><br>
</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:#1F497D">Phone: +46 10 709
</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:black">32 78</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Calibri",sans-serif;color:#1F497D"><br>
<br>
</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:#1F497D">E-mail</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Calibri",sans-serif;color:#1F497D">:
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:peter.svanberg@kb.se"><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:#0563C1">peter.svanberg@kb.se</span></a></span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Calibri",sans-serif;color:black"><br>
</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:#1F497D">Web</span><span lang="EN-GB" style="font-size:9.0pt;font-family:"Calibri",sans-serif;color:#1F497D">:
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="http://www.kb.se"><span lang="EN-GB" style="font-size:9.0pt;font-family:"Arial",sans-serif;color:#0563C1">www.kb.se</span></a><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">Från:</span></b><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users &lt;</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span lang="EN-GB" style="color:#0563C1">netarchivesuite-users-bounces@ml.sbforge.org</span></a></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">&gt;
<b>För </b>Tue Hejlskov Larsen<br>
<b>Skickat:</b> den 24 juni 2019 12:22<br>
<b>Till:</b> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="mailto:netarchivesuite-users@ml.sbforge.org"><span lang="EN-GB" style="color:#0563C1">netarchivesuite-users@ml.sbforge.org</span></a></span><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><br>
<b>Ämne:</b> Re: [Netarchivesuite-users] Your URI/sec and KB/sec figures?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
</div>
</div>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Hi Peter</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">We have currently only minor performance issues during harvesting. We have almost finished with our 2. broadcrawl this year – it will end up between 60-70 TB
<i>harvested</i> pages.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Our harvesting capacity is 90-100 Heritrix harvesters including some virtual Umbra harvesters…</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">We are using physical servers for the broadcrawl harvesters and virtual servers for selective harvesters.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">The 5 physical servers have each:</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">32 G MEM, 24 CPU’s, 4 TB local storage</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">The 5 Virtual servers using NFS:</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">20 G RAM, 8 CPU’s and 3 TB NFS storage</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">On each server we have between 8-10 Heritrix instances running – except for the Umbra harvesters, which only have one per server.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Between the harvester and the www we have a firewall and throttling firewall agreements with about 5 webhotels, because they blocked/throttled our harvesters.</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Best regards</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D">Tue</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p><b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">From:</span></b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> NetarchiveSuite-users &lt;<a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span style="color:#0563C1">netarchivesuite-users-bounces@ml.sbforge.org</span></a>&gt;
<b>On Behalf Of </b>Peter Svanberg<br>
<b>Sent:</b> Monday, June 24, 2019 11:39 AM<br>
<b>To:</b> <a href="mailto:netarchivesuite-users@ml.sbforge.org"><span style="color:#0563C1">netarchivesuite-users@ml.sbforge.org</span></a><br>
<b>Subject:</b> [Netarchivesuite-users] Your URI/sec and KB/sec figures?<o:p></o:p></span></p>
</div>
</div>
<p><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> <o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">Hello!</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">I discovered a Heritrix mailinglist(*). Amongst some interesting tips on making the crawl faster, I also read some speed figures far from what we ever get. So I ask
you: what do you get as speed values?</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">Our latest 19 selective harvests have the following figures (from crawl-report.txt in the jobs metadata WARC file):</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">URIs/sec: slowest job 0,83; fastest job 9,8; average 5,11</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">KB/sec: slowest 34; fastest 863; average 313</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">(I realize that this, besides NAS/Heritrix configuration, depends much on hardware, memory, disk I/O, network capacity etc. but I don’t know which such figures are
most relevant to add to this comparison. Suggestions?)</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"> </span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><o:p></o:p></span></p>
<p><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black">*
</span><span style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:black"><a href="https://groups.yahoo.com/neo/groups/archive-crawler/conversations/messages"><span lang="EN-GB" style="color:#0563C1">https://groups.yahoo.com/neo/groups/archive-crawler/conversations/messages</span></a><o:p></o:p></span></p>
</div>
</div>
</div>
</body>
</html>