Unverified commit 8f4c5e95, authored by dmorley

detect language from body text of main page

parent 4fe717ba
......@@ -4,7 +4,7 @@ Poduptime is software to get live stats and data on listed Diaspora Pods.
Dependencies:
```
php7.2 php7.2-curl php7.2-pgsql php-geoip php7.2-cli php7.2-common php7.2-json php7.2-readline php-cgi git curl postgresql postgresql-contrib wget dnsutils bind9 npm nodejs nodejs-legacy composer
php7.2 php7.2-curl php7.2-pgsql php-geoip php7.2-cli php7.2-common php7.2-json php7.2-readline php7.2-mbstring php7.2-xml php-cgi git curl postgresql postgresql-contrib wget dnsutils bind9 npm nodejs nodejs-legacy composer
```
To Install:
......
......@@ -3,7 +3,8 @@
"noplanman/xec": "0.1.0",
"gabordemooij/redbean": "^5.0",
"jaybizzle/crawler-detect" :"1.*",
"commerceguys/enum": "^1.0"
"commerceguys/enum": "^1.0",
"patrickschur/language-detection": "^3.3"
},
"autoload": {
"classmap": ["lib"]
......
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"content-hash": "b753491dc03e2084e9587d0bbafad98c",
"content-hash": "9e58e26526aa53d67d437ddb4f2fcf5e",
"packages": [
{
"name": "commerceguys/enum",
......@@ -185,6 +185,52 @@
"timeout"
],
"time": "2017-03-12T19:16:23+00:00"
},
{
"name": "patrickschur/language-detection",
"version": "v3.3.0",
"source": {
"type": "git",
"url": "https://github.com/patrickschur/language-detection.git",
"reference": "21a2e7a1b9bf6bff578ac11c6dcf3d3668aeccdf"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/patrickschur/language-detection/zipball/21a2e7a1b9bf6bff578ac11c6dcf3d3668aeccdf",
"reference": "21a2e7a1b9bf6bff578ac11c6dcf3d3668aeccdf",
"shasum": ""
},
"require": {
"ext-mbstring": "*",
"php": "^7"
},
"require-dev": {
"phpunit/phpunit": "^6"
},
"type": "library",
"autoload": {
"psr-4": {
"LanguageDetection\\": "src/LanguageDetection"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Patrick Schur",
"email": "patrick_schur@outlook.de"
}
],
"description": "A language detection library for PHP. Detects the language from a given text string.",
"homepage": "https://github.com/patrickschur/language-detection",
"keywords": [
"detect",
"detection",
"language"
],
"time": "2018-02-01T17:12:47+00:00"
}
],
"packages-dev": [],
......
-- Adds the column that stores the language detected from a pod's main page text.
-- IF NOT EXISTS keeps the migration idempotent, so re-running it against an
-- already-migrated database is a no-op instead of an error (PostgreSQL 9.6+).
ALTER TABLE pods ADD COLUMN IF NOT EXISTS detectedlanguage text;
......@@ -2,6 +2,7 @@
//* Copyright (c) 2011, David Morley. This file is licensed under the Affero General Public License version 3 or later. See the COPYRIGHT file. */
use RedBeanPHP\R;
use LanguageDetection\Language;
$debug = isset($_GET['debug']) || (isset($argv) && in_array('debug', $argv, true));
$newline = PHP_SAPI === 'cli' ? "\n" : '<br>';
......@@ -83,6 +84,16 @@ foreach ($pods as $pod) {
$admin_rating = -1;
}
// Detect the pod's language from the text of its main page.
// Reset on every iteration so that a pod whose page cannot be fetched or parsed
// does not inherit the language detected for the previous pod in the loop.
$detectedlanguage = null;
$d = new DOMDocument;
// Collect HTML parse errors internally instead of emitting a warning per malformed tag.
libxml_use_internal_errors(true);
if ($d->loadHTMLFile('https://' . $domain)) {
    // item(0) is null when the document has no <body> element.
    $body = $d->getElementsByTagName('body')->item(0);
    if ($body !== null && trim($body->nodeValue) !== '') {
        $ld = new Language;
        // Keys of the result array are language codes, best match first.
        $best = $ld->detect($body->nodeValue)->bestResults()->close();
        if ($best) {
            $detectedlanguage = strtoupper(key($best));
            _debug('Detected Language', $detectedlanguage);
        }
    }
}
// Drop the buffered parse errors so they do not accumulate across the pod loop.
libxml_clear_errors();
$chss = curl_init();
curl_setopt($chss, CURLOPT_URL, 'https://' . $domain . '/nodeinfo/1.0');
curl_setopt($chss, CURLOPT_CONNECTTIMEOUT, 10);
......@@ -279,6 +290,7 @@ foreach ($pods as $pod) {
$p['score'] = $score;
$p['adminrating'] = $admin_rating;
$p['country'] = $country;
$p['detectedlanguage'] = $detectedlanguage;
$p['city'] = $city;
$p['state'] = $state;
$p['lat'] = $lat;
......
......@@ -12,6 +12,7 @@ CREATE TABLE pods (
userrating decimal DEFAULT 0,
hidden boolean DEFAULT true,
ip text,
detectedlanguage text,
country text,
city text,
state text,
......
......@@ -6,7 +6,7 @@ defined('PODUPTIME') || die();
try {
$pods = R::getAll('
SELECT domain, dnssec, podmin_statement, sslexpire, masterversion, shortversion, softwarename, monthsmonitored, score, signup, name, country, city, state, lat, long, uptime_alltime, active_users_halfyear, active_users_monthly, service_facebook, service_twitter, service_tumblr, service_wordpress, service_xmpp, latency, date_updated, ipv6, total_users, local_posts, comment_counts, userrating, status
SELECT domain, dnssec, podmin_statement, sslexpire, masterversion, shortversion, softwarename, monthsmonitored, score, signup, name, country, detectedlanguage, city, state, lat, long, uptime_alltime, active_users_halfyear, active_users_monthly, service_facebook, service_twitter, service_tumblr, service_wordpress, service_xmpp, latency, date_updated, ipv6, total_users, local_posts, comment_counts, userrating, status
FROM pods
WHERE status < ?
ORDER BY weightedscore DESC
......@@ -39,6 +39,7 @@ try {
<th><a data-toggle="tooltip" data-placement="bottom" title="System Score on a 100 point scale.">Score</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Does this domain use DNSSEC.">DNSSEC</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Pod location, based on IP Geolocation.">Country</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Pod language detected from their main page text.">Language</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="External Social Networks this pod can post to.">Services</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Click for more information about this pod from the pod host (podmin).">Info</a></th>
</tr>
......@@ -88,6 +89,7 @@ try {
} else {
echo '<td data-toggle="tooltip" data-placement="bottom" title="City: ' . ($pod['city'] ?? 'n/a') . ' State: ' . ($pod['state'] ?? 'n/a') . '">' . $pod['country'] . '</td>';
}
echo '<td>' . ($pod['detectedlanguage'] ? $pod['detectedlanguage'] : '') . '</td>';
echo '<td>';
$pod['service_facebook'] && print '<div class="smlogo smlogo-facebook" title="Publish to Facebook"></div>';
$pod['service_twitter'] && print '<div class="smlogo smlogo-twitter" title="Publish to Twitter"></div>';
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment