CEO of Manticore Software. Passionate about custom search engines and Manticore Software.
# Crawl ${domain} recursively and stream wget's per-file log lines into load.php.
# wget prints its -nv log on stderr, so it has to be merged into stdout (2>&1),
# otherwise nothing reaches the pipe and load.php never sees a downloaded page.
wget -nv -r -H -nd --connect-timeout=2 --read-timeout=10 --tries=1 --follow-tags=a -R "*.css*,*.js*,*.png,*.jpg,*.gif" "http://${domain}/" --domains="${domain}" 2>&1 | php load.php
<?php
/* load.php: reads wget's "-nv" log from STDIN line by line and, for every page wget reports
as downloaded, extracts its <title> and stores title, url and body in the Manticore table "rt".
*/
const READ_RETRY_DELAY_US = 10000; # pause between attempts to read a not-yet-written file: 10 milliseconds
const READ_MAX_ATTEMPTS = 500; # give up on a file after ~5 seconds instead of looping forever

$f = fopen('php://stdin', 'r'); # we'll be waiting for data at STDIN
$manticore = new mysqli('manticore', '', '', '', 9306); # let's connect to Manticore Search via MySQL protocol
if ($manticore->connect_errno) { # nothing can be indexed without a connection, so fail loudly
    fwrite(STDERR, "Can't connect to Manticore: {$manticore->connect_error}\n");
    exit(1);
}
$manticore->query("CREATE TABLE IF NOT EXISTS rt(title text, body text, url text stored) html_strip='1' html_remove_elements='style,script,a' morphology='stem_en' index_sp='1'"); /* creating a table "rt" if it doesn't exist with the following settings:
- html_strip='1': stripping HTML is on
- html_remove_elements='style,script,a': for HTML tags <style>/<script>/<a> we don't need their contents, so we are stripping them completely
- morphology='stem_en': we'll use English stemmer as a morphology processor
- index_sp='1': we'll also index sentences and paragraphs for more advanced full-text search capabilities and better relevance
*/
while (($s = fgets($f)) !== false) { /* reading from STDIN line by line until EOF. Here is an example of what wget returns:
2020-04-08 07:39:33 URL:https://www.who.int/westernpacific/ [98667/98667] -> "index.html.3" [1]
which means that:
- the original URL was https://www.who.int/westernpacific/
- that it saved the contents to index.html.3
*/
    if (!preg_match('/URL:(?<url>http.*?) \[.*?\] -> "(?<path>.*?)"/', $s, $match)) continue; # if wget returns smth else we are just skipping it, otherwise we use regexp to put the url and the path to $match

    # it may be that wget reports a download earlier than the file appears, so we retry reading it,
    # but only a bounded number of times: an empty or never-appearing file must not hang the crawler
    $content = false;
    for ($attempt = 0; $attempt < READ_MAX_ATTEMPTS; $attempt++) {
        $content = @file_get_contents('./'.$match['path']); # reading from the file
        if ($content !== false && $content !== '') break; # got the content, no need to sleep
        usleep(READ_RETRY_DELAY_US);
    }
    if ($content === false || $content === '') {
        fwrite(STDERR, "{$match['path']}: no content, skipping {$match['url']}\n");
        continue;
    }

    if (!preg_match('/<title>(?<title>.*?)<\/title>/is', $content, $content_match)) continue; # we are not interested in pages without a title
    $title = trim(html_entity_decode($content_match['title'])); # a simple HTML page parsing to get <title>

    echo "{$match['path']}: $title {$match['url']} ".strlen($content)." bytes\n"; # let's say something about our progress

    # and we are finally putting the contents to Manticore. We use crc32(title) as a document ID to avoid duplicates.
    $inserted = $manticore->query("REPLACE INTO rt (id,title,url,body) VALUES(".crc32($title).",'".$manticore->escape_string($title)."','".$manticore->escape_string($match['url'])."','".$manticore->escape_string($content)."')");
    if (!$inserted) fwrite(STDERR, "{$match['path']}: insert failed: {$manticore->error}\n"); # report the failure, but keep crawling
} # and we are going back to the next page wget reports as downloaded
Manticore is a lightweight database written in C++, created specifically for search, with powerful full-text search capabilities.
# Minimal service definition: a single Manticore Search container from the official image
services:
  manticore:
    image: manticoresearch/manticore:3.4.0
Docker Compose is a tool for defining and running multi-container Docker applications. With Compose, you use a YAML file to configure your application’s services. Then, with a single command, you create and start all the services from your configuration
version: '2.2'

services:

  # Manticore Search is a small yet powerful database for search with awesome full-text search capabilities
  manticore:
    # we'll just use their official image
    image: manticoresearch/manticore:3.4.0
    # and create a volume for data persistency
    volumes:
      - ./data:/var/lib/manticore

  # we also need php
  php:
    # which we'll build ourselves from Dockerfile
    build: php
    # no point to run the php container before manticore, hence the dependency
    depends_on:
      - manticore
    # the command below just runs wget to start crawling the domain passed in the env. variable
    # and lets the wget output (stderr merged into stdout) flow to "php load.php" which inserts it into Manticore Search
    command: /bin/bash -c 'wget -nv -r -H -nd --connect-timeout=2 --read-timeout=10 --tries=1 --follow-tags=a -R "*.css*,*.js*,*.png,*.jpg,*.gif" "http://${domain}/" --domains=${domain} 2>&1 | php load.php'

  # let's also add a tiny php script to visualize what we have in Manticore
  web:
    # we'll use php 7.2. + Apache for that
    image: php:7.2-apache
    # it also depends on Manticore
    depends_on:
      - manticore
    # let's bind it to 8082 port locally (quoted so YAML never reads host:container as a number)
    ports:
      - "8082:80"
    # we'll mirror folder "www" to /var/www/html/ inside the web server container so ./www/index.php will be the front page
    volumes:
      - ./www/:/var/www/html/
# Let's take php 7.4 as a base image
FROM php:7.4-cli
# We'll also install wget (the crawler itself) and the PHP mysqli extension
# (load.php uses it to connect to Manticore), then remove the apt package lists
# in the same layer so they don't stay in the image
RUN apt-get update \
    && apt-get -y install wget \
    && docker-php-source extract \
    && docker-php-ext-install mysqli \
    && docker-php-source delete \
    && rm -rf /var/lib/apt/lists/*
# We'll use load.php, so we need to copy it to the image
COPY load.php /usr/src/myapp/
# And let's change the working dir
WORKDIR /usr/src/myapp
snikolaev@dev:~/crawler$ domain=who.int docker-compose up
Starting crawler_manticore_1 … done
Recreating crawler_web_1 … done
Starting crawler_php_1 … done
...
php_1 | data.5: GHO https://www.who.int/data/gho 125537 bytes
php_1 | fact-sheets.4: Fact sheets https://www.who.int/news-room/fact-sheets 83345 bytes
php_1 | facts-in-pictures.3: Facts in pictures https://www.who.int/news-room/facts-in-pictures 70227 bytes
php_1 | publications.7: WHO | Publications https://www.who.int/publications/en/ 92069 bytes
php_1 | questions-answers.3: WHO | Online Q&A https://www.who.int/features/qa/en/ 78145 bytes
php_1 | popular.3: Health topics https://www.who.int/health-topics/ 123263 bytes
php_1 | ebola-virus-disease.8: Ebola virus disease https://www.who.int/health-topics/ebola/ 112116 bytes
<?php
# The search phrase comes straight from the URL, i.e. it is untrusted: accept it only if it is a plain string
$search = (isset($_GET['search']) && is_string($_GET['search'])) ? $_GET['search'] : null;
?>
<form><h1>Manticore</h1><input name="search" type="text" style="width: 50%; border: 1px solid" value="<?=htmlspecialchars($search ?? '', ENT_QUOTES)?>"></form>
<hr>
<?php
if ($search !== null) { # we have a search request, let's process it
    /* here we are SELECTing :
    - url
    - highlighted title
    - highlighted body
    - from the index called "rt"
    - we want all documents that MATCH() our search query
    - and we need only the first 10, hence LIMIT 10
    Backslashes and single quotes in the phrase are escaped so it can't break out of the SQL string literal.
    */
    $sql = "SELECT url, highlight({}, 'title') title, highlight({}, 'body') body FROM rt WHERE MATCH('" . addcslashes($search, "\\'") . "') LIMIT 10";

    $ch = curl_init(); # initializing curl
    curl_setopt($ch, CURLOPT_URL, "http://manticore:9308/sql"); # we'll connect to Manticore's /sql endpoint via HTTP. There's also /json/search/ which gives much more granular control, but for the sake of simplicity we'll use the /sql endpoint
    curl_setopt($ch, CURLOPT_POST, 1); # we'll send via POST
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); # we need the response back, don't output it
    curl_setopt($ch, CURLOPT_POSTFIELDS, 'mode=raw&query=' . rawurlencode($sql)); # percent-encoding keeps "&", "=", "%" etc. in the phrase from corrupting the form body
    $response = curl_exec($ch); # running the query
    curl_close($ch);

    $json = is_string($response) ? json_decode($response) : null; # decoding the JSON
    if (is_object($json) && isset($json->data)) { # an error response has no "data", so there is nothing to show then
        foreach ($json->data as $result) {
            $url = htmlspecialchars($result->url, ENT_QUOTES); # the url is crawled data, escape it before putting it into HTML
            # NOTE(review): title and body are printed as is because highlight() marks the matches with HTML tags;
            # presumably the rest of the snippet is safe to output - confirm against the table's html_strip behaviour
            echo "<small>{$url}</small><br><a href=\"{$url}\">{$result->title}</a><br>{$result->body}<br><br>"; # and here we just output the results: url, title and body
        }
    }
}
# Fetch the demos, enter the crawler demo and start crawling who.int.
# The steps are chained with && so a failed clone or cd never starts docker-compose in the wrong directory.
git clone https://github.com/manticoresoftware/demos.git manticore_demos \
  && cd manticore_demos/crawler/ \
  && domain=who.int docker-compose up