Currently we have some license issues. We are working on it.

Verified Commit 8f4c5e95 authored by dmorley's avatar dmorley
Browse files

detect language from body text of main page

parent 4fe717ba
......@@ -4,7 +4,7 @@ Poduptime is software to get live stats and data on listed Diaspora Pods.
Dependencies:
```
php7.2 php7.2-curl php7.2-pgsql php-geoip php7.2-cli php7.2-common php7.2-json php7.2-readline php-cgi git curl postgresql postgresql-contrib wget dnsutils bind9 npm nodejs nodejs-legacy composer
php7.2 php7.2-curl php7.2-pgsql php-geoip php7.2-cli php7.2-common php7.2-json php7.2-readline php7.2-mbstring php7.2-xml php-cgi git curl postgresql postgresql-contrib wget dnsutils bind9 npm nodejs nodejs-legacy composer
```
To Install:
......
......@@ -3,7 +3,8 @@
"noplanman/xec": "0.1.0",
"gabordemooij/redbean": "^5.0",
"jaybizzle/crawler-detect" :"1.*",
"commerceguys/enum": "^1.0"
"commerceguys/enum": "^1.0",
"patrickschur/language-detection": "^3.3"
},
"autoload": {
"classmap": ["lib"]
......
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"content-hash": "b753491dc03e2084e9587d0bbafad98c",
"content-hash": "9e58e26526aa53d67d437ddb4f2fcf5e",
"packages": [
{
"name": "commerceguys/enum",
......@@ -185,6 +185,52 @@
"timeout"
],
"time": "2017-03-12T19:16:23+00:00"
},
{
"name": "patrickschur/language-detection",
"version": "v3.3.0",
"source": {
"type": "git",
"url": "https://github.com/patrickschur/language-detection.git",
"reference": "21a2e7a1b9bf6bff578ac11c6dcf3d3668aeccdf"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/patrickschur/language-detection/zipball/21a2e7a1b9bf6bff578ac11c6dcf3d3668aeccdf",
"reference": "21a2e7a1b9bf6bff578ac11c6dcf3d3668aeccdf",
"shasum": ""
},
"require": {
"ext-mbstring": "*",
"php": "^7"
},
"require-dev": {
"phpunit/phpunit": "^6"
},
"type": "library",
"autoload": {
"psr-4": {
"LanguageDetection\\": "src/LanguageDetection"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Patrick Schur",
"email": "patrick_schur@outlook.de"
}
],
"description": "A language detection library for PHP. Detects the language from a given text string.",
"homepage": "https://github.com/patrickschur/language-detection",
"keywords": [
"detect",
"detection",
"language"
],
"time": "2018-02-01T17:12:47+00:00"
}
],
"packages-dev": [],
......
-- Migration: add the column holding the language detected from a pod's main page text.
-- IF NOT EXISTS (PostgreSQL 9.6+) makes the migration safe to re-run on an
-- already-upgraded database instead of failing with "column already exists".
-- The column is nullable: pods whose language has not been detected stay NULL.
ALTER TABLE pods ADD COLUMN IF NOT EXISTS detectedlanguage text;
......@@ -2,6 +2,7 @@
//* Copyright (c) 2011, David Morley. This file is licensed under the Affero General Public License version 3 or later. See the COPYRIGHT file. */
use RedBeanPHP\R;
use LanguageDetection\Language;
$debug = isset($_GET['debug']) || (isset($argv) && in_array('debug', $argv, true));
$newline = PHP_SAPI === 'cli' ? "\n" : '<br>';
......@@ -83,6 +84,16 @@ foreach ($pods as $pod) {
$admin_rating = -1;
}
// Best-effort detection of the pod's language from the text of its main page.
// Reset for every pod: without this, a pod whose page cannot be loaded (or has
// no usable body text) would be stored with the previous pod's language.
$detectedlanguage = null;
$d = new DOMDocument;
// Collect HTML parse errors internally instead of emitting a warning per malformed tag.
libxml_use_internal_errors(true);
if ($d->loadHTMLFile('https://' . $domain)) {
    // item(0) is null when the document has no <body> element.
    $body = $d->getElementsByTagName('body')->item(0);
    if ($body !== null && $body->nodeValue) {
        $ld = new Language;
        $closest = $ld->detect($body->nodeValue)->bestResults()->close();
        // The array is keyed by language code; the first key is the best match.
        if (!empty($closest)) {
            $detectedlanguage = strtoupper(key($closest));
            _debug('Detected Language', $detectedlanguage);
        }
    }
}
// Drop the buffered parse errors so they do not pile up across the pod loop.
libxml_clear_errors();
$chss = curl_init();
curl_setopt($chss, CURLOPT_URL, 'https://' . $domain . '/nodeinfo/1.0');
curl_setopt($chss, CURLOPT_CONNECTTIMEOUT, 10);
......@@ -279,6 +290,7 @@ foreach ($pods as $pod) {
$p['score'] = $score;
$p['adminrating'] = $admin_rating;
$p['country'] = $country;
$p['detectedlanguage'] = $detectedlanguage;
$p['city'] = $city;
$p['state'] = $state;
$p['lat'] = $lat;
......
......@@ -12,6 +12,7 @@ CREATE TABLE pods (
userrating decimal DEFAULT 0,
hidden boolean DEFAULT true,
ip text,
detectedlanguage text,
country text,
city text,
state text,
......
......@@ -6,7 +6,7 @@ defined('PODUPTIME') || die();
try {
$pods = R::getAll('
SELECT domain, dnssec, podmin_statement, sslexpire, masterversion, shortversion, softwarename, monthsmonitored, score, signup, name, country, city, state, lat, long, uptime_alltime, active_users_halfyear, active_users_monthly, service_facebook, service_twitter, service_tumblr, service_wordpress, service_xmpp, latency, date_updated, ipv6, total_users, local_posts, comment_counts, userrating, status
SELECT domain, dnssec, podmin_statement, sslexpire, masterversion, shortversion, softwarename, monthsmonitored, score, signup, name, country, detectedlanguage, city, state, lat, long, uptime_alltime, active_users_halfyear, active_users_monthly, service_facebook, service_twitter, service_tumblr, service_wordpress, service_xmpp, latency, date_updated, ipv6, total_users, local_posts, comment_counts, userrating, status
FROM pods
WHERE status < ?
ORDER BY weightedscore DESC
......@@ -39,6 +39,7 @@ try {
<th><a data-toggle="tooltip" data-placement="bottom" title="System Score on a 100 point scale.">Score</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Does this domain use DNSSEC.">DNSSEC</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Pod location, based on IP Geolocation.">Country</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Pod language detected from their main page text.">Language</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="External Social Networks this pod can post to.">Services</a></th>
<th><a data-toggle="tooltip" data-placement="bottom" title="Click for more information about this pod from the pod host (podmin).">Info</a></th>
</tr>
......@@ -88,6 +89,7 @@ try {
} else {
echo '<td data-toggle="tooltip" data-placement="bottom" title="City: ' . ($pod['city'] ?? 'n/a') . ' State: ' . ($pod['state'] ?? 'n/a') . '">' . $pod['country'] . '</td>';
}
echo '<td>' . ($pod['detectedlanguage'] ? $pod['detectedlanguage'] : '') . '</td>';
echo '<td>';
$pod['service_facebook'] && print '<div class="smlogo smlogo-facebook" title="Publish to Facebook"></div>';
$pod['service_twitter'] && print '<div class="smlogo smlogo-twitter" title="Publish to Twitter"></div>';
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment