# grep -B 1 "hogehoge" ./higehige.txt
後の行も表示
# grep -A 1 "hogehoge" ./higehige.txt
# grep -B 1 "hogehoge" ./higehige.txt
後の行も表示
# grep -A 1 "hogehoge" ./higehige.txt
# wget http://sourceforge.net/projects/mysql-python/files/latest/download # tar xvf MySQL-python-1.2.4b4.tar.gz # python setup.py build # python setup.py install
サンプルプログラム。
#!/usr/bin/env python
# coding: utf-8
import MySQLdb
con = MySQLdb.connect(db='nutch', host='localhost', user='nutch', passwd='password')
cur = con.cursor()
q = 'SELECT id, title FROM webpage LIMIT 10'
cur.execute(q)
rows = cur.fetchall()
for row in rows:
    print "%s ( %s )" % (row[1], row[0])
cur.close()
con.close()
実行結果。
# python myMySQLforPython.py Welcome to Apache Nutch ( org.apache.nutch:http/ ) About Apache Nutch ( org.apache.nutch:http/about.html ) None ( org.apache.nutch:http/about.pdf ) All Classes (apache-nutch 1.6 API) ( org.apache.nutch:http/apidocs-1.6/allclasses-frame.html ) apache-nutch 1.6 API ( org.apache.nutch:http/apidocs-1.6/index.html ) None ( org.apache.nutch:http/apidocs-1.6/org/apache/nutch/analysis/lang/HTMLLanguageParser.html ) None ( org.apache.nutch:http/apidocs-1.6/org/apache/nutch/analysis/lang/LanguageIndexingFilter.html ) None ( org.apache.nutch:http/apidocs-1.6/org/apache/nutch/analysis/lang/package-frame.html ) None ( org.apache.nutch:http/apidocs-1.6/org/apache/nutch/collection/CollectionManager.html ) None ( org.apache.nutch:http/apidocs-1.6/org/apache/nutch/collection/package-frame.html )
# mysql -u root -p Enter password: Welcome to the MySQL monitor. Commands end with ; or \g. Your MySQL connection id is 138 Server version: 5.1.61 Source distribution Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved. Oracle is a registered trademark of Oracle Corporation and/or its affiliates. Other names may be trademarks of their respective owners. Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. mysql> mysql> use mysql; Reading table information for completion of table and column names You can turn off this feature to get a quicker startup with -A Database changed mysql> mysql> GRANT ALL PRIVILEGES ON nutch.* TO nutch@localhost IDENTIFIED BY '****'; Query OK, 0 rows affected (0.00 sec) mysql> mysql> quit Bye
# cd /opt # ls apache-nutch-2.1-src.tar.gz # tar xvf apache-nutch-2.1-src.tar.gz # cd apache-nutch-2.1
<property> <name>storage.data.store.class</name> <value>org.apache.gora.sql.store.SqlStore</value> </property>
gora.datastore.default=org.apache.gora.sql.store.SqlStore gora.sqlstore.jdbc.driver=com.mysql.jdbc.Driver gora.sqlstore.jdbc.url=jdbc:mysql://localhost:3306/nutch?createDatabaseIfNotExist=true gora.sqlstore.jdbc.user=<作成したユーザ名(ここではnutch)> gora.sqlstore.jdbc.password=<設定したパスワード>
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/>
# ant <省略>
<property> <name>http.agent.name</name> <value>My Nutch Spider</value> </property>
# mkdir urls
urls/seed.txt
#+. +^http://([a-z0-9]*\.)*nutch.apache.org
# ./runtime/local/bin/nutch inject urls/ InjectorJob: starting InjectorJob: urlDir: urls 2013/01/27 12:56:49 org.apache.gora.sql.store.SqlStore createSchema 情報: creating schema: webpage InjectorJob: org.apache.gora.util.GoraException: java.io.IOException: com.mysql.jdbc.exceptions.jdbc4.MySQLSyntaxErrorException: Column length too big for column 'text' (max = 21845); use BLOB or TEXT instead at org.apache.gora.store.DataStoreFactory.createDataStore(DataStoreFactory.java:167) at org.apache.gora.store.DataStoreFactory.createDataStore(DataStoreFactory.java:135) at org.apache.nutch.storage.StorageUtils.createWebStore(StorageUtils.java:75) at org.apache.nutch.crawl.InjectorJob.run(InjectorJob.java:214) at org.apache.nutch.crawl.InjectorJob.inject(InjectorJob.java:228) at org.apache.nutch.crawl.InjectorJob.run(InjectorJob.java:248) at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65) at org.apache.nutch.crawl.InjectorJob.main(InjectorJob.java:258) Caused by: java.io.IOException: com.mysql.jdbc.exceptions.jdbc4.MySQLSyntaxErrorException: Column length too big for column 'text' (max = 21845); use BLOB or TEXT instead at org.apache.gora.sql.store.SqlStore.createSchema(SqlStore.java:226) at org.apache.gora.sql.store.SqlStore.initialize(SqlStore.java:172) at org.apache.gora.store.DataStoreFactory.initializeDataStore(DataStoreFactory.java:102) at org.apache.gora.store.DataStoreFactory.createDataStore(DataStoreFactory.java:161) ... 
7 more Caused by: com.mysql.jdbc.exceptions.jdbc4.MySQLSyntaxErrorException: Column length too big for column 'text' (max = 21845); use BLOB or TEXT instead at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:39) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:27) at java.lang.reflect.Constructor.newInstance(Constructor.java:513) at com.mysql.jdbc.Util.handleNewInstance(Util.java:411) at com.mysql.jdbc.Util.getInstance(Util.java:386) at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:1052) at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3609) at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3541) at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2002) at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2163) at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2624) at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:2127) at com.mysql.jdbc.PreparedStatement.executeUpdate(PreparedStatement.java:2427) at com.mysql.jdbc.PreparedStatement.executeUpdate(PreparedStatement.java:2345) at com.mysql.jdbc.PreparedStatement.executeUpdate(PreparedStatement.java:2330) at org.apache.gora.sql.store.SqlStore.createSchema(SqlStore.java:224) ... 10 moreエラーがでました。textカラム(Webの本文が格納される)のサイズが少ないと、型をBLOBかTEXTにせよと。
mysql> drop database nutch; Query OK, 0 rows affected (0.00 sec) mysql> CREATE DATABASE nutch DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_unicode_ci; Query OK, 1 row affected (0.00 sec) mysql> use nutch; Database changed mysql> CREATE TABLE `webpage` ( -> `id` varchar(255) NOT NULL, -> `headers` blob, -> `text` mediumtext DEFAULT NULL, -> `status` int(11) DEFAULT NULL, -> `markers` blob, -> `parseStatus` blob, -> `modifiedTime` bigint(20) DEFAULT NULL, -> `score` float DEFAULT NULL, -> `typ` varchar(32) CHARACTER SET latin1 DEFAULT NULL, -> `baseUrl` varchar(767) DEFAULT NULL, -> `content` longblob, -> `title` varchar(2048) DEFAULT NULL, -> `reprUrl` varchar(767) DEFAULT NULL, -> `fetchInterval` int(11) DEFAULT NULL, -> `prevFetchTime` bigint(20) DEFAULT NULL, -> `inlinks` mediumblob, -> `prevSignature` blob, -> `outlinks` mediumblob, -> `fetchTime` bigint(20) DEFAULT NULL, -> `retriesSinceFetch` int(11) DEFAULT NULL, -> `protocolStatus` blob, -> `signature` blob, -> `metadata` blob, -> PRIMARY KEY (`id`) -> ) ENGINE=InnoDB -> ROW_FORMAT=COMPRESSED -> DEFAULT CHARSET=utf8; Query OK, 0 rows affected (0.00 sec) mysql> show tables; +-----------------+ | Tables_in_nutch | +-----------------+ | webpage | +-----------------+ 1 row in set (0.00 sec)gora-sql-mapping.xmlのidカラムの長さを512から767に変更する。
./runtime/local/bin/nutch crawl urls/ -depth 1 -topN 1
MySQLで確認。(ずれていてみにくいですが。)
mysql> select id, score, fetchTime from webpage; +----------------------------------------------+-----------+---------------+ | id | score | fetchTime | +----------------------------------------------+-----------+---------------+ | org.apache.nutch:http/ | 1.05882 | 1361854115396 | | org.apache.nutch:http/about.html | 0.0588235 | 1359262135084 | | org.apache.nutch:http/apidocs-1.6/index.html | 0.0588235 | 1359262135086 | | org.apache.nutch:http/apidocs-2.1/index.html | 0.0588235 | 1359262135086 | | org.apache.nutch:http/bot.html | 0.0588235 | 1359262135087 | | org.apache.nutch:http/credits.html | 0.0588235 | 1359262135088 | | org.apache.nutch:http/faq.html | 0.0588235 | 1359262135089 | | org.apache.nutch:http/index.html | 0.0588235 | 1359262135090 | | org.apache.nutch:http/index.pdf | 0.0588235 | 1359262135092 | | org.apache.nutch:http/issue_tracking.html | 0.0588235 | 1359262135093 | | org.apache.nutch:http/mailing_lists.html | 0.0588235 | 1359262135094 | | org.apache.nutch:http/nightly.html | 0.0588235 | 1359262135095 | | org.apache.nutch:http/old_downloads.html | 0.0588235 | 1359262135095 | | org.apache.nutch:http/sonar.html | 0.0588235 | 1359262135096 | | org.apache.nutch:http/tutorial.html | 0.0588235 | 1359262135097 | | org.apache.nutch:http/version_control.html | 0.0588235 | 1359262135099 | | org.apache.nutch:http/wiki.html | 0.0588235 | 1359262135100 | +----------------------------------------------+-----------+---------------+ 17 rows in set (0.00 sec)
# pwd /opt/apache-nutch-2.1 # cat urls/seed.txt http://nutch.apache.org/ http://lucene.apache.org/ http://cassandra.apache.org/ # nutch inject urls/ InjectorJob: starting InjectorJob: urlDir: urls <省略> InjectorJob: finished確かめる。
# nutch readdb -dump ./out_dir # cat out_dir/part-r-00000 http://cassandra.apache.org/ key: org.apache.cassandra:http/ baseUrl: null status: 0 (null) fetchInterval: 2592000 fetchTime: 1358584769397 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: (null) parseStatus: (null) title: null score: 1.0 markers: {dist=0, _injmrk_=y} metadata _csh_ : ?� http://lucene.apache.org/ key: org.apache.lucene:http/ baseUrl: null status: 0 (null) fetchInterval: 2592000 fetchTime: 1358584769397 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: (null) parseStatus: (null) title: null score: 1.0 markers: {dist=0, _injmrk_=y} metadata _csh_ : ?� http://nutch.apache.org/ key: org.apache.nutch:http/ baseUrl: null status: 0 (null) fetchInterval: 2592000 fetchTime: 1358584769397 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: (null) parseStatus: (null) title: null score: 1.0 markers: {dist=0, _injmrk_=y} metadata _csh_ : ?�たしかに3つのサイトが登録されている。
# nutch generate <省略>そして確認。readdb -dumpの出力先ディレクトリは、存在するとエラーになる。ここでは同じディレクトリにするので最初に消しておく。
# rm -rf out_dir # nutch readdb -dump ./out_dir <省略> # cat out_dir/part-r-00000 http://cassandra.apache.org/ key: org.apache.cassandra:http/ baseUrl: null status: 0 (null) fetchInterval: 2592000 fetchTime: 1358584769397 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: (null) parseStatus: (null) title: null score: 1.0 markers: {dist=0, _injmrk_=y, _gnmrk_=1358585856-208596666} metadata _csh_ : ?� http://lucene.apache.org/ key: org.apache.lucene:http/ baseUrl: null status: 0 (null) fetchInterval: 2592000 fetchTime: 1358584769397 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: (null) parseStatus: (null) title: null score: 1.0 markers: {dist=0, _injmrk_=y, _gnmrk_=1358585856-208596666} metadata _csh_ : ?� http://nutch.apache.org/ key: org.apache.nutch:http/ baseUrl: null status: 0 (null) fetchInterval: 2592000 fetchTime: 1358584769397 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: (null) parseStatus: (null) title: null score: 1.0 markers: {dist=0, _injmrk_=y, _gnmrk_=1358585856-208596666} metadata _csh_ : ?�markersのところが追記されているが、大きな変化はなし。
# nutch generate -topN 1 <省略> # nutch readdb -dump out_dir # cat out_dir/part-r-00000 http://cassandra.apache.org/ key: org.apache.cassandra:http/ <省略> markers: {dist=0, _injmrk_=y, _gnmrk_=1358586890-1155183335} metadata _csh_ : ?� http://lucene.apache.org/ key: org.apache.lucene:http/ <省略> markers: {dist=0, _injmrk_=y} metadata _csh_ : ?� http://nutch.apache.org/ key: org.apache.nutch:http/ <省略> markers: {dist=0, _injmrk_=y} metadata _csh_ : ?�最初のサイトに対してのみ、markersに追記がなされている。
# nutch fetch 1358585856-208596666 <省略>Cassandraをのぞくと、確かにコンテンツが格納されている模様。
# cassandra-cli Connected to: "Cassandra Cluster" on Welcome to Cassandra CLI version 1.2.0 Type 'help;' or '?' for help. Type 'quit;' or 'exit;' to quit. [default@unknown] use webpage; [default@webpage] list f; Using default limit of 100 Using default column limit of 100 ------------------- RowKey: 6f72672e6170616368652e6e757463683a687474702f => (column=bas, value=http://nutch.apache.org/, timestamp=1358588578114000) => (column=cnt, value= <省略> , timestamp=1358588578191000) => (column=fi, value=2592000, timestamp=1358588024031000) => (column=pts, value=1358588021620, timestamp=1358588578188001) => (column=s, value=1.0, timestamp=1358588024032000) => (column=st, value=2, timestamp=1358588578187000) => (column=ts, value=1358588574261, timestamp=1358588578188000) => (column=typ, value=application/xhtml+xml, timestamp=1358588578192000) 3 Rows Returned. Elapsed time: 27 msec(s). [default@webpage]
# nutch parse 1358588033-175013002 <省略>CassandraのColumnFamily「p」に解析されたデータが格納される。
[default@webpage] list sc; Using default limit of 100 Using default column limit of 100 ------------------- RowKey: 6f72672e6170616368652e6e757463683a687474702f => (super_column=h, (column=Accept-Ranges, value=bytes, timestamp=1358588578175000) (column=Connection, value=close, timestamp=1358588578168000) (column=Content-Encoding, value=gzip, timestamp=1358588578162000) (column=Content-Length, value=8631, timestamp=1358588578171000) (column=Content-Type, value=text/html; charset=utf-8, timestamp=1358588578169000) (column=Date, value=Sat, 19 Jan 2013 09:42:53 GMT, timestamp=1358588578173000) (column=ETag, value="84c4-4d0994769b476-gzip", timestamp=1358588578166000) (column=Last-Modified, value=Tue, 11 Dec 2012 20:10:53 GMT, timestamp=1358588578172000) (column=Server, value=Apache/2.4.3 (Unix) OpenSSL/1.0.0g, timestamp=1358588578176000) (column=Vary, value=Accept-Encoding, timestamp=1358588578178000)) => (super_column=mk, (column=__prsmrk__, value=1358588033-175013002, timestamp=1358590210616000) (column=_ftcmrk_, value=1358588033-175013002, timestamp=1358590210613000) (column=_gnmrk_, value=1358588033-175013002, timestamp=1358590210614000) (column=_injmrk_, value=y, timestamp=1358590210611000) (column=dist, value=0, timestamp=1358590210610000)) => (super_column=mtdt, (column=_csh_, value=, timestamp=1358590006461000)) => (super_column=ol, (column=http://lucene.apache.org/java/, value=Lucene, timestamp=1358590210557000) (column=http://lucene.apache.org/solr/, value=Solr, timestamp=1358590210581000) (column=http://nutch.apache.org/, value=Nutch, timestamp=1358590210606000) <省略> developers and community members hang out in the #cassandra channel on irc.freenode.net . If you are new to IRC, you can use a web-based client . Dead Trees Cassandra High Performance Cookbook , by Ed Capriolo. Covers Cassandra 0.8. Also on Amazon . Copyright © 2009 The Apache Software Foundation . Licensed under the Apache License, Version 2.0. 
Apache and the Apache feather logo are trademarks of The Apache Software Foundation. Privacy Policy ., timestamp=1358590210680000) => (column=sig, value=�#)�2s��D��j�, timestamp=1358590210678000) => (column=t, value=The Apache Cassandra Project, timestamp=1358590210679000) 3 Rows Returned. Elapsed time: 13 msec(s).
# nutch updatedb
Cassandraをのぞくとデータ件数が45に増えている。
[default@webpage] list f; <省略> => (column=ts, value=1358590521209, timestamp=1358590521282000) 45 Rows Returned. Elapsed time: 80 msec(s).
readdbでも確認してみる。
# cat ./out_dir/part-r-00000 http://cassandra.apache.org/ key: org.apache.cassandra:http/ baseUrl: http://cassandra.apache.org/ status: 2 (status_fetched) fetchInterval: 2592000 fetchTime: 1363772574261 prevFetchTime: 1358588021620 retries: 0 modifiedTime: 0 protocolStatus: SUCCESS, args=[] parseStatus: success/ok (1/0), args=[] title: The Apache Cassandra Project score: 1.0 signature: 14efbfbd2329efbfbd3273efbfbdefbfbd44efbfbdefbfbd6aefbfbd000000 markers: {dist=0, _injmrk_=y, _updmrk_=1358588033-175013002, _gnmrk_=1358588033-175013002, _ftcmrk_=1358588033-175013002, __prsmrk__=1358588033-175013002} metadata _csh_ : http://cassandra.apache.org/download/ key: org.apache.cassandra:http/download/ baseUrl: null status: 1 (status_unfetched) fetchInterval: 2592000 fetchTime: 1358590521209 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: UNKNOWN_CODE_0, args=[] parseStatus: notparsed/ok (0/0), args=[] title: null score: 0.0 markers: {dist=1} metadata _csh_ : http://cassandra.apache.org/privacy.html key: org.apache.cassandra:http/privacy.html baseUrl: null status: 1 (status_unfetched) fetchInterval: 2592000 fetchTime: 1358590521210 prevFetchTime: 0 retries: 0 modifiedTime: 0 protocolStatus: UNKNOWN_CODE_0, args=[] parseStatus: notparsed/ok (0/0), args=[] title: null score: 0.0 markers: {dist=1} metadata _csh_ : http://lucene.apache.org/ key: org.apache.lucene:http/ baseUrl: http://lucene.apache.org/ status: 2 (status_fetched) fetchInterval: 2592000 fetchTime: 1363772574561 prevFetchTime: 1358588021620 retries: 0 modifiedTime: 0 protocolStatus: SUCCESS, args=[] parseStatus: success/ok (1/0), args=[] title: Apache Lucene - Welcome to Apache Lucene score: 1.0 signature: efbfbdefbfbdefbfbdefbfbd67efbfbd74efbfbd5cefbfbdefbfbd21413befbfbdefbfbd0000000000000000000000000000000000000000000000000000000000000000000000 markers: {dist=0, _injmrk_=y, _updmrk_=1358588033-175013002, _gnmrk_=1358588033-175013002, _ftcmrk_=1358588033-175013002, __prsmrk__=1358588033-175013002} 
metadata _csh_ :
# nutch generate -topN 10 <省略> GeneratorJob: generated batch id: 1358607691-1967365662 # nutch fetch 1358607691-1967365662 # nutch parse 1358607691-1967365662 # nutch updatedb
ColumnFamily「p」が13件に、ColumnFamily「f」が125件となった。
SLF4J: Class path contains multiple SLF4J bindings. SLF4J: Found binding in [jar:file:/usr/lib/hbase/lib/slf4j-log4j12-1.5.8.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: Found binding in [jar:file:/usr/lib/hadoop-0.20/lib/slf4j-log4j12-1.4.3.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: Found binding in [jar:file:/opt/apache-nutch-1.6/lib/slf4j-log4j12-1.6.1.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: Found binding in [jar:file:/opt/apache-cassandra-1.2.0/lib/slf4j-log4j12-1.7.2.jar!/org/slf4j/impl/StaticLoggerBinder.class] SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. Exception in thread "main" java.lang.NoSuchMethodError: org.apache.thrift.meta_data.FieldValueMetaData.<init>(BZ)V at org.apache.cassandra.thrift.ColumnParent. (ColumnParent.java:128) at NutchCassandraData. (NutchCassandraData.java:66) at NutchCassandraData.main(NutchCassandraData.java:108)
原因は、エラーログの通り、複数のslf4j-log4jがクラスパスに存在するため。
# cd /opt # ls apache-solr-4.0.0.tgz # tar xvf apache-solr-4.0.0.tgz
Nutchのインストールディレクトリ配下confにSolr4用の設定ファイルが用意されているので、これをSolrのインストールディレクトリの適切な場所にコピー。
# cp /opt/apache-nutch-2.1/conf/schema-solr4.xml /opt/apache-solr-4.0.0/example/solr/collection1/conf/schema.xmlが、1.6の時と同様にエラーが出て起動しない・・・。
<field name="_version_" type="long" indexed="true" stored="true" />これで起動するはず。以下で起動して、 ブラウザで「http://<ホスト名>:8983/solr/」を参照して起動を確認する。
# cd /opt/apache-solr-4.0.0/example/ # java -jar start.jar
# /opt/apache-nutch-2.1/runtime/local/bin/nutch solrindex http://localhost:8983/solr -all <省略> 2013/01/19 10:54:31 org.apache.hadoop.mapred.Counters log 情報: SPLIT_RAW_BYTES=1096 2013/01/19 10:54:31 org.apache.hadoop.mapred.Counters log 情報: Map output records=33 SolrIndexerJob: done.
# cd /opt # tar xvf apache-cassandra-1.2.0-bin.tar.gz # cd apache-cassandra-1.2.0
cluster_name: 'Cassandra Cluster'
# ./bin/cassandra
# ./bin/cassandra-cli Connected to: "Cassandra Cluster" on Welcome to Cassandra CLI version 1.2.0 Type 'help;' or '?' for help. Type 'quit;' or 'exit;' to quit. [default@unknown] [default@unknown] show cluster name; Cassandra Clusterとりあえず接続できているようなので、正常に起動している模様。
# cd /opt # ls apache-nutch-2.1-src.tar.gz # tar xvf apache-nutch-2.1-src.tar.gz # cd apache-nutch-2.1
<property> <name>storage.data.store.class</name> <value>org.apache.gora.cassandra.store.CassandraStore</value> </property>
gora.datastore.default=org.apache.gora.cassandra.store.CassandraStore gora.cassandrastore.servers=localhost:9160
<dependency org="org.apache.gora" name="gora-cassandra" rev="0.2" conf="*->default" />
# ant <省略>
<property> <name>http.agent.name</name> <value>My Nutch Spider</value> </property>
# mkdir urls
urls/seed.txt
#+. +^http://([a-z0-9]*\.)*nutch.apache.org
# nutch crawl urls -dir crawl -depth 3 -topN 20
# cassandra-cli Connected to: "Cassandra Cluster" on Welcome to Cassandra CLI version 1.2.0 Type 'help;' or '?' for help. Type 'quit;' or 'exit;' to quit. [default@unknown] use webpage; Authenticated to keyspace: webpageカラムファミリ「f」を全件参照。
[default@webpage] list f limit 1000; <省略> => (column=ts, value=1358554167986, timestamp=1358554170426001) ------------------- RowKey: 6f72672e6170616368652e6e757463683a687474702f617069646f63732d322e312f6f72672f6170616368652f6e757463682f70726f746f636f6c2f687474702f6170692f426c6f636b6564457863657074696f6e2e68746d6c => (column=fi, value=2592000, timestamp=1358554170550001) => (column=s, value=1.3605226E-4, timestamp=1358554170551000) => (column=st, value=1, timestamp=1358554170549002) => (column=ts, value=1358554167993, timestamp=1358554170550000) 739 Rows Returned. Elapsed time: 1349 msec(s).カラムファミリ「p」を全件参照。
[default@webpage] list p limit 1000; <省略> Parse Plugins org.apache.nutch.parse.headings Indexing Filter Plugins org.apache.nutch.indexer.anchor An indexing plugin for inbound anchor text. org.apache.nutch.indexer.basic A basic indexing plugin. org.apache.nutch.indexer.feed org.apache.nutch.indexer.metadata org.apache.nutch.indexer.staticfield A simple plugin called at indexing that adds fields with static data. org.apache.nutch.indexer.subcollection org.apache.nutch.indexer.tld Top Level Domain Indexing plugin. org.apache.nutch.indexer.urlmeta URL Meta Tag Indexing Plugin Misc. Plugins org.apache.nutch.analysis.lang Text document language identifier. org.apache.nutch.collection Subcollection is a subset of an index. org.creativecommons.nutch Sample plugins that parse and index Creative Commons medadata. Apache Nutch is an open source web-search software project. Nutch is a project of the Apache Software Foundation and is part of the larger Apache community of developers and users. Overview Package Class Use Tree Deprecated Index Help PREV NEXT FRAMES NO FRAMES All Classes Copyright © 2012 The Apache Software Foundation, timestamp=1358554162452000) => (column=sig, value=o�>SA-����=�Hɏ, timestamp=1358554162451000) => (column=t, value=Overview (apache-nutch 1.6 API), timestamp=1358554162451001) 21 Rows Returned. Elapsed time: 18 msec(s).ちゃんとデータはストアされている模様。
# cd /opt # ls apache-nutch-1.6-bin.tar.gz # tar xvf apache-nutch-1.6-bin.tar.gz
export PATH=$PATH:/opt/apache-nutch-1.6/bin
で、コマンドを実行してみる
# cd apache-nutch-1.6 # nutch Usage: nutch COMMAND where COMMAND is one of: crawl one-step crawler for intranets (DEPRECATED - USE CRAWL SCRIPT INSTEAD) readdb read / dump crawl db mergedb merge crawldb-s, with optional filtering readlinkdb read / dump link db inject inject new urls into the database generate generate new segments to fetch from crawl db freegen generate new segments to fetch from text files fetch fetch a segment's pages parse parse a segment's pages readseg read / dump segment data mergesegs merge several segments, with optional filtering and slicing updatedb update crawl db from segments after fetching invertlinks create a linkdb from parsed segments mergelinkdb merge linkdb-s, with optional filtering solrindex run the solr indexer on parsed segments and linkdb solrdedup remove duplicates from solr solrclean remove HTTP 301 and 404 documents from solr parsechecker check the parser for a given url indexchecker check the indexing filters for a given url domainstats calculate domain statistics from crawldb webgraph generate a web graph from existing segments linkrank run a link analysis program on the generated web graph scoreupdater updates the crawldb with linkrank scores nodedumper dumps the web graph's node scores plugin load a plugin and run one of its classes main() junit runs the given JUnit test or CLASSNAME run the class named CLASSNAME Most commands print help when invoked w/o parameters.
# vi conf/nutch-default.xml
nutch-default.xml
<property> <name>http.agent.name</name> <value>My Nutch Spider</value> </property>
# mkdir urls # vi urls/seed.txt
seed.txt
# nutch crawl urls -dir crawl -depth 1 -topN 2
# cd /opt # ls apache-solr-4.0.0.tgz # tar xvf apache-solr-4.0.0.tgz
# cp /opt/apache-nutch-1.6/conf/schema.xml /opt/apache-solr-4.0.0/example/solr/collection1/conf/schema.xmlが、以下のコマンドでSolrを起動してみるといろいろとエラーが出て起動しない・・・
# cd /opt/apache-solr-4.0.0/example/ # java -jar start.jar
設定ファイル「schema.xml」を少し書き換える。
# vi solr/collection1/conf/schema.xml追記
<field name="text" type="text" stored="false" indexed="true"/>
<field name="_version_" type="long" indexed="true" stored="true" />コメントアウト
<!-- <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/> -->すると起動するはず。
# nutch crawl urls/ -solr http://localhost:8983/solr/ -depth 2 -topN 50