<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 15 (filtered medium)">
<!--[if !mso]><style>v\:* {behavior:url(#default#VML);}
o\:* {behavior:url(#default#VML);}
w\:* {behavior:url(#default#VML);}
.shape {behavior:url(#default#VML);}
</style><![endif]--><style><!--
/* Font Definitions */
@font-face
        {font-family:Wingdings;
        panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
        {font-family:"Cambria Math";
        panose-1:2 4 5 3 5 4 6 3 2 4;}
@font-face
        {font-family:Calibri;
        panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
        {font-family:Consolas;
        panose-1:2 11 6 9 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
        {margin:0cm;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
a:link, span.MsoHyperlink
        {mso-style-priority:99;
        color:blue;
        text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
        {mso-style-priority:99;
        color:purple;
        text-decoration:underline;}
p
        {mso-style-priority:99;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
pre
        {mso-style-priority:99;
        mso-style-link:"HTML - förformaterad Char";
        margin:0cm;
        margin-bottom:.0001pt;
        font-size:10.0pt;
        font-family:"Courier New",serif;}
tt
        {mso-style-priority:99;
        font-family:"Courier New",serif;}
p.MsoListParagraph, li.MsoListParagraph, div.MsoListParagraph
        {mso-style-priority:34;
        margin-top:0cm;
        margin-right:0cm;
        margin-bottom:0cm;
        margin-left:36.0pt;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
span.HTML-frformateradChar
        {mso-style-name:"HTML - förformaterad Char";
        mso-style-priority:99;
        mso-style-link:"HTML - förformaterad";
        font-family:Consolas;}
p.msonormal0, li.msonormal0, div.msonormal0
        {mso-style-name:msonormal;
        mso-style-priority:99;
        mso-margin-top-alt:auto;
        margin-right:0cm;
        mso-margin-bottom-alt:auto;
        margin-left:0cm;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
span.E-postmall22
        {mso-style-type:personal;
        font-family:"Calibri",sans-serif;
        color:#1F497D;}
p.FormateretHTML, li.FormateretHTML, div.FormateretHTML
        {mso-style-name:"Formateret HTML";
        mso-style-link:"Formateret HTML Tegn";
        margin:0cm;
        margin-bottom:.0001pt;
        font-size:12.0pt;
        font-family:"Times New Roman",serif;}
span.FormateretHTMLTegn
        {mso-style-name:"Formateret HTML Tegn";
        mso-style-priority:99;
        mso-style-link:"Formateret HTML";
        font-family:"Courier New",serif;
        mso-fareast-language:DA;}
span.E-postmall27
        {mso-style-type:personal;
        font-family:"Calibri",sans-serif;
        color:#1F497D;}
span.E-postmall28
        {mso-style-type:personal-compose;
        font-family:"Calibri",sans-serif;
        color:windowtext;}
.MsoChpDefault
        {mso-style-type:export-only;
        font-size:10.0pt;}
@page WordSection1
        {size:612.0pt 792.0pt;
        margin:3.0cm 2.0cm 3.0cm 2.0cm;}
div.WordSection1
        {page:WordSection1;}
/* List Definitions */
@list l0
        {mso-list-id:1404520494;
        mso-list-type:hybrid;
        mso-list-template-ids:-1276323592 1519042836 69009411 69009413 69009409 69009411 69009413 69009409 69009411 69009413;}
@list l0:level1
        {mso-level-start-at:2;
        mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Symbol;
        mso-fareast-font-family:Calibri;
        mso-bidi-font-family:"Times New Roman";}
@list l0:level2
        {mso-level-number-format:bullet;
        mso-level-text:o;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:"Courier New",serif;}
@list l0:level3
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level4
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Symbol;}
@list l0:level5
        {mso-level-number-format:bullet;
        mso-level-text:o;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:"Courier New",serif;}
@list l0:level6
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
@list l0:level7
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Symbol;}
@list l0:level8
        {mso-level-number-format:bullet;
        mso-level-text:o;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:"Courier New",serif;}
@list l0:level9
        {mso-level-number-format:bullet;
        mso-level-text:;
        mso-level-tab-stop:none;
        mso-level-number-position:left;
        text-indent:-18.0pt;
        font-family:Wingdings;}
ol
        {margin-bottom:0cm;}
ul
        {margin-bottom:0cm;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="SV" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Thank you for that. We had increased on the admin host but now we increased on the harvesters also. But we have not seen
 any “too many open files” errors, shouldn’t that show up in some log file?<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Our scheduling of 1,8 million domains in 180 jobs of 10000 each took about 4 days (5 domains/second). This is step 1,
 i.e. first run (with 500 kByte limit). How many domains did you schedule in 6-9 hours?<o:p></o:p></span></p>
<p class="MsoNormal" style="margin-top:12.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Could you explain more? Do you deliberately delay the scheduling, how? Some NAS parameter?<o:p></o:p></span></p>
<p class="MsoNormal" style="margin-top:12.0pt"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Other strange things:<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-top:12.0pt;text-indent:-18.0pt;mso-list:l0 level1 lfo1">
<![if !supportLists]><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D;mso-fareast-language:EN-US"><span style="mso-list:Ignore">·<span style="font:7.0pt "Times New Roman"">       
</span></span></span><![endif]><span lang="EN-GB" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">The admin host’s openmq process uses 200 % cpu also *<b>after</b>* the scheduling is done (and it shouldn’t have
 so much to do).<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-top:12.0pt;text-indent:-18.0pt;mso-list:l0 level1 lfo1">
<![if !supportLists]><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D;mso-fareast-language:EN-US"><span style="mso-list:Ignore">·<span style="font:7.0pt "Times New Roman"">       
</span></span></span><![endif]><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">The GUI is only partially updating, and out of sync with reality … when snapshot job n+30 is ready, it reports
 that job n is ready, with date and time from job n+30.<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-top:12.0pt;text-indent:-18.0pt;mso-list:l0 level1 lfo1">
<![if !supportLists]><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D;mso-fareast-language:EN-US"><span style="mso-list:Ignore">·<span style="font:7.0pt "Times New Roman"">       
</span></span></span><![endif]><span lang="EN-GB" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">Jobs are run on different servers but never more than one at a time on a server (but with different harvester
 on the server at different times). And it doesn’t seem to use all the servers either.<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-top:12.0pt;text-indent:-18.0pt;mso-list:l0 level1 lfo1">
<![if !supportLists]><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D;mso-fareast-language:EN-US"><span style="mso-list:Ignore">·<span style="font:7.0pt "Times New Roman"">       
</span></span></span><![endif]><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">There is sometimes a lot of delay between “Requested to check the validity of harvest channel 'SNAPSHOT'” and
 “Received message stating that channel 'SNAPSHOT' is valid.”.<o:p></o:p></span></p>
<p class="MsoListParagraph" style="margin-top:12.0pt;text-indent:-18.0pt;mso-list:l0 level1 lfo1">
<![if !supportLists]><span lang="EN-GB" style="font-size:11.0pt;font-family:Symbol;color:#1F497D;mso-fareast-language:EN-US"><span style="mso-list:Ignore">·<span style="font:7.0pt "Times New Roman"">       
</span></span></span><![endif]><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Sometimes some process thinks that a harvest job is ready and moves files, while the harvest process itself
 continues and gets an exception when it doesn’t find the files to write in.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">As Sara said, “</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif">About everything should
 happen during the first broad crawl!”<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif">Regards,<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-GB" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<div>
<div style="border:none;border-top:solid #E1E1E1 1.0pt;padding:3.0pt 0cm 0cm 0cm">
<p class="MsoNormal"><b><span lang="EN-GB" style="font-size:11.0pt;font-family:'Calibri',sans-serif">Från:</span></b><span lang="EN-GB" style="font-size:11.0pt;font-family:'Calibri',sans-serif"> NetarchiveSuite-users &lt;netarchivesuite-users-bounces@ml.sbforge.org&gt;
<b>För </b>Tue Hejlskov Larsen<br>
<b>S</b></span><b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif">kickat:</span></b><span style="font-size:11.0pt;font-family:"Calibri",sans-serif"> den 16 september 2019 09:12<br>
<b>Till:</b> netarchivesuite-users@ml.sbforge.org<br>
<b>Ämne:</b> Re: [Netarchivesuite-users] NAS broad crawl questions<o:p></o:p></span></p>
</div>
</div>
<p class="MsoNormal"><o:p> </o:p></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">Our 5 broad crawl servers (up to 10 harvesters per server) have the following setup<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">[prod@kb-prod-har-001 ~]$ cat /etc/security/limits.d/90-nproc.conf<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"># Default limit for number of user's processes to prevent<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"># accidental fork bombs.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"># See rhbz #432903 for reasoning.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">*          soft    nproc     40000<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">root       soft    nproc     unlimited<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal" style="mso-margin-top-alt:6.0pt;margin-right:0cm;margin-bottom:6.0pt;margin-left:0cm;background:white">
<span lang="EN-US" style="font-size:10.5pt;font-family:"Arial",sans-serif;color:#222222">/etc/security/limits.conf<o:p></o:p></span></p>
<div style="border:solid #EAECF0 1.0pt;padding:12.0pt 12.0pt 12.0pt 12.0pt">
<p class="MsoNormal" style="line-height:15.6pt;background:#F8F9FA"><span lang="EN-US" style="font-size:10.5pt;font-family:"Courier New",serif;color:black">prod             soft    nofile          20000<o:p></o:p></span></p>
<p class="MsoNormal" style="line-height:15.6pt;background:#F8F9FA"><span lang="DA" style="font-size:10.5pt;font-family:"Courier New",serif;color:black">prod             hard    nofile          20000<o:p></o:p></span></p>
</div>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">the proftpd server on each harvester has no session limit and is niced between -10  and -20<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">We don’t have any problems with the scheduling of selective jobs during broad crawl job scheduling. In DK it takes about
 6-9 hours to schedule about 350 jobs in step 2.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">Previously – some years ago – we had the problem too, that there were no selective jobs scheduled during broad crawl
 job scheduling, but not anymore.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">The setup of the broad crawl job scheduling in DK is delayed with long HarvestJobManager timeouts, because of the previous
 scheduling issues. <o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Even though we still have submitted on the queue - it is only temporary and not a production issue any longer.<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">The job scheduling is delayed so much that it can take a couple of hours to get all harvesters running with jobs. Most
 of the time I have 5-10 that are just “sleeping”, even though there are a lot of jobs in the “new” queue.
<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif;color:#1F497D;mso-fareast-language:EN-US">Sometimes I can provoke some of the waiting harvesters to take a job by restarting another “listening” harvester.
  <o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Best regards<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="EN-US" style="font-size:11.0pt;font-family:"Calibri",sans-serif;color:#1F497D;mso-fareast-language:EN-US">Tue<o:p></o:p></span></p>
<p class="MsoNormal"><b><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif">From:</span></b><span lang="EN-US" style="font-size:11.0pt;font-family:'Calibri',sans-serif"> NetarchiveSuite-users &lt;<a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org">netarchivesuite-user<span lang="DA">s-bounces@ml.sbforge.org</span></a></span><span lang="DA" style="font-size:11.0pt;font-family:'Calibri',sans-serif">&gt;
<b>On Behalf Of </b><a href="mailto:sara.aubry@bnf.fr">sara.aubry@bnf.fr</a><br>
<b>Sent:</b> Friday, September 13, 2019 6:59 PM<br>
<b>To:</b> <a href="mailto:netarchivesuite-users@ml.sbforge.org">netarchivesuite-users@ml.sbforge.org</a><br>
<b>Subject:</b> Re: [Netarchivesuite-users] NAS broad crawl questions<o:p></o:p></span></p>
<p class="MsoNormal"><span lang="DA"><o:p> </o:p></span></p>
<p class="MsoNormal"><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif">Regarding the limit, if you talked to Tue about crawler configuration, then you're probably ok.</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:'Arial',sans-serif">Generation of all snapshot jobs takes precedence over the generation of selective ones.</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:'Arial',sans-serif">At BnF, before we launch the broad crawl, we make sure our daily crawls have started because the whole generation for about 1000 jobs takes between 4 and 5 hours.</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:'Arial',sans-serif">If you do have a snapshot harvest controller truly available (with no grey dot), then the second job should start.
</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:'Arial',sans-serif">Common problems (at least some we encountered) are:</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:'Arial',sans-serif">- access problem to the arc repository
</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:'Arial',sans-serif">- unwanted characters in seed lists causing the deactivation of the harvest definition</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif">- broker out of memory</span><span lang="DA"><br>
<br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif">Sara</span><span lang="DA"><br>
<br>
<br>
<br>
<br>
</span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif;color:#5F5F5F">De :        </span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">"Peter Svanberg" <<a href="mailto:Peter.Svanberg@kb.se">Peter.Svanberg@kb.se</a>></span><span lang="DA"><br>
</span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif;color:#5F5F5F">A :        </span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">"<a href="mailto:netarchivesuite-users@ml.sbforge.org">netarchivesuite-users@ml.sbforge.org</a>"
 <<a href="mailto:netarchivesuite-users@ml.sbforge.org">netarchivesuite-users@ml.sbforge.org</a>></span><span lang="DA"><br>
</span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif;color:#5F5F5F">Date :        </span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">13/09/2019 18:44</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif;color:#5F5F5F">Objet :        </span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">Re: [Netarchivesuite-users] NAS broad crawl questions</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif;color:#5F5F5F">Envoyé par :        </span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">"NetarchiveSuite-users" <<a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org">netarchivesuite-users-bounces@ml.sbforge.org</a>></span><span lang="DA"><o:p></o:p></span></p>
<div class="MsoNormal" align="center" style="text-align:center"><span lang="DA">
<hr size="2" width="100%" noshade="" style="color:#A0A0A0" align="center">
</span></div>
<p class="MsoNormal"><span lang="DA"><br>
<br>
<br>
10000 is what the default limits give. Should we change that?<br>
<br>
One job started and ended but next snapshot job didn’t start. That’s what is strange.<br>
<br>
Then later no selected job is started either. Everything seems to have stopped/paused, except snapshot job creation.<br>
<br>
We will dig further in logs etc.<br>
<br>
/Peter<br>
<br>
13 sep. 2019 kl. 18:15 skrev "<a href="mailto:sara.aubry@bnf.fr">sara.aubry@bnf.fr</a>" <<a href="mailto:sara.aubry@bnf.fr">sara.aubry@bnf.fr</a>>:<br>
<br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif">Hello Peter,</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><br>
That's great news, just the start of a big adventure!<br>
About everything should happen during the first broad crawl!</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><br>
10 000 domains per job is quite big, we do only 5 000 but you probably have big crawlers.</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><br>
If you only had a single crawler started on the Snapshot channel, it's normal that only one job started.<br>
That's very cautious. We also do this to make sure that we don't fail about 1000 jobs in a row...</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><br>
Grey dot with no hostname means that your job is over and being post-processed with data transferred to the arc repository.<br>
To check on this, look at the end of your HarvestController log file.<br>
If everything went well, you can start another crawler, see if you are crawling well, and then launch your other crawlers.</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><br>
Job generation can be quite long.</span><span lang="DA"><br>
</span><span lang="DA" style="font-size:10.0pt;font-family:"Arial",sans-serif"><br>
Best,<br>
<br>
Sara</span><span lang="DA"><br>
<br>
<br>
<br>
<br>
</span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif;color:#5F5F5F"><br>
De :        </span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">"Peter Svanberg" <</span><span lang="DA"><a href="mailto:Peter.Svanberg@kb.se"><span style="font-size:7.5pt;font-family:"Arial",sans-serif">Peter.Svanberg@kb.se</span></a></span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">><span style="color:#5F5F5F"><br>
A :        </span>"</span><span lang="DA"><a href="mailto:netarchivesuite-users@ml.sbforge.org"><span style="font-size:7.5pt;font-family:"Arial",sans-serif">netarchivesuite-users@ml.sbforge.org</span></a></span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">"
 <</span><span lang="DA"><a href="mailto:netarchivesuite-users@ml.sbforge.org"><span style="font-size:7.5pt;font-family:"Arial",sans-serif">netarchivesuite-users@ml.sbforge.org</span></a></span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">><span style="color:#5F5F5F"><br>
Date :        </span>13/09/2019 18:03<span style="color:#5F5F5F"><br>
Objet :        </span>[Netarchivesuite-users] NAS broad crawl questions<span style="color:#5F5F5F"><br>
Envoyé par :        </span>"NetarchiveSuite-users" <</span><span lang="DA"><a href="mailto:netarchivesuite-users-bounces@ml.sbforge.org"><span style="font-size:7.5pt;font-family:"Arial",sans-serif">netarchivesuite-users-bounces@ml.sbforge.org</span></a></span><span lang="DA" style="font-size:7.5pt;font-family:"Arial",sans-serif">></span><span lang="DA"><o:p></o:p></span></p>
<div class="MsoNormal" align="center" style="text-align:center"><span lang="DA">
<hr size="2" width="100%" noshade="" style="color:#A0A0A0" align="center">
</span></div>
<p class="MsoNormal" style="margin-bottom:12.0pt"><span lang="DA"><br>
<br>
</span><span lang="DA" style="font-family:"Calibri",sans-serif"><br>
This Wednesday at 11:02 we started our first NAS broad crawl, tadaa! (Pär has pictures showing Thomas and I pressing the mouse button, clicking on “Activate”.)<br>
<br>
It started well, with the job creation process. The first job, though, contained only one domain – maybe because it was special, with lots of non-default seeds. Then there was job two, containing 9999 domains, and then the process continued, with 10000 domains
 in each job.<br>
<br>
After that, the first snapshot job started running. But after it was finished, no more snapshot jobs were started.<br>
<br>
Later, our selective harvests started and ran as scheduled. But when they were finished, nothing seems to happen in the job finishing and job starting area. The “All Running Jobs” page just contains job rows with a grey dot (crawl finished) and no host name.
 But the job creation process continues, with now soon 100 jobs with 10000 domains each.<br>
<br>
1)     Do you have any hints on what could have happened? Is the admin host so occupied with job creation that it can’t handle anything else? But it wasn’t during the first hours. Where could we look to find out what could be wrong? (In log files, of course,
 but what should we look for?)<br>
<br>
We will let the job creation be finished (which will happen approximately Sunday after 18) and see what then happens.<br>
<br>
Then, concerning starting a broad crawl:<br>
<br>
2)     We were advised to just have one harvester process running when the snapshot harvest is activated, which we did. But when could more processes be started? After the first snapshot job is started? Or should we wait until all jobs are created?<br>
<br>
Regards,<br>
</span><span lang="DA" style="font-family:"Arial",sans-serif"><br>
-----<br>
<br>
Peter Svanberg<br>
Technical officer<br>
Digital Collections Department, Newspapers, Radio and Television Division<br>
<br>
National Library of Sweden<br>
PO Box 5039 <br>
SE-104 51 Stockholm<br>
Visits: Karlavägen 100, Stockholm <br>
Phone: +46 10 709 32 78<br>
<br>
E-mail</span><span lang="DA" style="font-family:"Calibri",sans-serif">: </span><span lang="DA"><a href="mailto:peter.svanberg@kb.se"><span style="font-family:"Arial",sans-serif">peter.svanberg@kb.se</span></a></span><span lang="DA" style="font-family:"Arial",sans-serif"><br>
Web</span><span lang="DA" style="font-family:'Calibri',sans-serif">: </span><span lang="DA"><a href="https://www.kb.se"><span style="font-family:'Arial',sans-serif">www.kb.se</span></a><br>
</span><span lang="DA" style="font-family:"Calibri",sans-serif"><br>
<br>
</span><tt><span lang="DA" style="font-size:10.0pt">_______________________________________________</span></tt><span lang="DA" style="font-size:10.0pt;font-family:"Courier New",serif"><br>
<tt>NetarchiveSuite-users mailing list</tt><u><span style="color:blue"><br>
</span></u></span><span lang="DA"><a href="mailto:NetarchiveSuite-users@ml.sbforge.org"><tt><span style="font-size:10.0pt">NetarchiveSuite-users@ml.sbforge.org</span></tt></a><u><span style="color:blue"><br>
</span></u><a href="https://ml.sbforge.org/mailman/listinfo/netarchivesuite-users"><tt><span style="font-size:10.0pt">https://ml.sbforge.org/mailman/listinfo/netarchivesuite-users</span></tt></a><o:p></o:p></span></p>
<div class="MsoNormal" align="center" style="text-align:center"><span lang="DA">
<hr size="2" width="100%" align="center">
</span></div>
<p><span lang="DA"><a href="https://www.bnf.fr/fr/actualites/journees-europeennes-du-patrimoine-2019"><b><i><span style="font-family:"Arial",sans-serif">Journées européennes du patrimoine 2019</span></i></b></a></span><span lang="DA" style="font-family:"Arial",sans-serif">-
 Samedi 21 et dimanche 22 septembre sur les sites de la BnF</span><span lang="DA"><o:p></o:p></span></p>
<p><b><span lang="DA" style="font-family:"Arial",sans-serif;color:green">Avant d'imprimer, pensez à l'environnement.</span></b><span lang="DA"><o:p></o:p></span></p>
<p><span lang="DA">_______________________________________________<br>
NetarchiveSuite-users mailing list<u><span style="color:blue"><br>
</span></u><a href="mailto:NetarchiveSuite-users@ml.sbforge.org">NetarchiveSuite-users@ml.sbforge.org</a><u><span style="color:blue"><br>
</span></u><a href="https://ml.sbforge.org/mailman/listinfo/netarchivesuite-users">https://ml.sbforge.org/mailman/listinfo/netarchivesuite-users</a></span><tt><span lang="DA" style="font-size:10.0pt">_______________________________________________</span></tt><span lang="DA" style="font-size:10.0pt;font-family:"Courier New",serif"><br>
<tt>NetarchiveSuite-users mailing list</tt><br>
<tt><a href="mailto:NetarchiveSuite-users@ml.sbforge.org">NetarchiveSuite-users@ml.sbforge.org</a></tt><br>
</span><span lang="DA"><a href="https://ml.sbforge.org/mailman/listinfo/netarchivesuite-users"><tt><span style="font-size:10.0pt">https://ml.sbforge.org/mailman/listinfo/netarchivesuite-users</span></tt></a><o:p></o:p></span></p>
<div class="MsoNormal" align="center" style="text-align:center"><span lang="DA" style="font-family:"Arial",sans-serif">
<hr size="2" width="100%" align="center">
</span></div>
<p><em><b><span lang="DA" style="font-family:"Arial",sans-serif"><a href="https://www.bnf.fr/fr/actualites/journees-europeennes-du-patrimoine-2019">Journées européennes du patrimoine 2019</a></span></b></em><span lang="DA" style="font-family:"Arial",sans-serif">
 - Samedi 21 et dimanche 22 septembre sur les sites de la BnF<o:p></o:p></span></p>
<p><strong><span lang="DA" style="font-family:"Arial",sans-serif;color:green">Avant d'imprimer, pensez à l'environnement.</span></strong><span lang="DA" style="font-family:"Arial",sans-serif;color:green"><o:p></o:p></span></p>
</div>
</body>
</html>