NAV Navbar
shell python ruby php java

Introduction

Welcome to the semanti.ca Web Article Data Extraction API! You can use our API to scrape news pages, blogs, and online magazine articles. There is no need to create an individual scraper for every data source: our AI-powered API automatically detects and extracts elements such as the title, headline, author(s), publication and update dates, images and their captions, tags/categories, and, of course, the text. The API also extracts key phrases from the text and classifies the extracted data according to the IPTC Media Topics Taxonomy.

We have language bindings in Shell, Python, Ruby, PHP, and Java. You can view code examples in the dark area to the right, and you can switch the programming language of the examples with the tabs in the top right.

Extracting Data

To extract data from a URL, use this code:

# With shell, you can just pass the correct header with each request.
# -G sends the --data-urlencode pair as a URL-encoded query string on a GET,
# so characters such as & or ? inside URL_TO_SCRAPE cannot break the request.
curl -G "https://semanti-ca.cloud.tyk.io/extract-web-article" \
  --data-urlencode "url=URL_TO_SCRAPE" \
  -H "authorization: YOUR_API_KEY"
import requests

API_key = "YOUR_API_KEY"

API_URL = "https://semanti-ca.cloud.tyk.io"

payload = {"url": "URL_TO_SCRAPE"}

# A Session reuses the underlying connection across several API calls.
semanti_ca = requests.Session()

# Passing the query through `params` lets requests URL-encode it, which works
# on both Python 2 and Python 3 (urllib.urlencode exists only on Python 2).
data = semanti_ca.get(API_URL + "/extract-web-article", params=payload, headers={'authorization': API_key}).json()
require 'excon'
require 'addressable/uri'
require 'json'

API_key = 'YOUR_API_KEY'

API_URL = 'https://semanti-ca.cloud.tyk.io'

# Addressable URL-encodes the query parameters for us.
payload = Addressable::URI.new
payload.query_values = { url: 'URL_TO_SCRAPE' }

# Persistent connection that sends the API key header with every request.
semanti_ca = Excon.new(API_URL, persistent: true, headers: { authorization: API_key })

response = semanti_ca.get(path: "/extract-web-article?#{payload.query}")
@data = JSON.parse(response.body)
$API_key = 'YOUR_API_KEY';
$API_URL = 'https://semanti-ca.cloud.tyk.io';

// http_build_query() URL-encodes the query parameters.
$payload = array('url' => 'URL_TO_SCRAPE');

$semanti_ca = curl_init();

$curl_opts = array(
    CURLOPT_URL => $API_URL . '/extract-web-article?' . http_build_query($payload),
    // Return the response body from curl_exec() instead of printing it.
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_HTTPHEADER => array(
        'Content-Type: application/x-www-form-urlencoded',
        'authorization: ' . $API_key
    ),
    CURLOPT_FOLLOWLOCATION => true
);

curl_setopt_array($semanti_ca, $curl_opts);

$response = curl_exec($semanti_ca);

// curl_exec() returns false on a transport failure; without this check
// json_decode() would silently turn that failure into null.
if ($response === false) {
    throw new RuntimeException('semanti.ca request failed: ' . curl_error($semanti_ca));
}

$data = json_decode($response);
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Minimal client for the semanti.ca Web Article Data Extraction API.
 *
 * <p>Replace {@code YOUR_API_KEY} with your own API key before use.
 */
public class SemantiCaBinding {

    final String apiKey = "YOUR_API_KEY";
    final String apiURL = "https://semanti-ca.cloud.tyk.io";

    /**
     * Builds a URL-encoded query string ({@code key1=value1&key2=value2}) from the given map.
     *
     * @param params query parameter names and values; both are encoded as UTF-8
     * @return the encoded query string, or an empty string when {@code params} is empty
     * @throws UnsupportedEncodingException if the UTF-8 encoding is not supported
     */
    public static String getParamsString(Map<String, String> params) throws UnsupportedEncodingException {
        StringBuilder result = new StringBuilder();

        for (Map.Entry<String, String> entry : params.entrySet()) {
            // Separate pairs up front so no trailing '&' has to be trimmed afterwards.
            if (result.length() > 0) {
                result.append('&');
            }
            result.append(URLEncoder.encode(entry.getKey(), "UTF-8"));
            result.append('=');
            result.append(URLEncoder.encode(entry.getValue(), "UTF-8"));
        }

        return result.toString();
    }

    /**
     * Sends a GET request to the {@code /extract-web-article} endpoint for the given article.
     *
     * @param articleURL URL of the Web article to extract data from
     * @return the parsed JSON response body, or {@code null} when the HTTP status is not 200
     * @throws IOException if the request cannot be sent or the response cannot be read
     */
    public JsonObject callSemantiCaAPI(String articleURL) throws IOException {

        Map<String, String> parameters = new HashMap<>();
        parameters.put("url", articleURL);

        URL url = new URL(apiURL + "/extract-web-article?" + getParamsString(parameters));
        HttpURLConnection semantiCa = (HttpURLConnection) url.openConnection();
        try {
            semantiCa.setRequestMethod("GET");
            semantiCa.setInstanceFollowRedirects(false);
            semantiCa.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
            semantiCa.setRequestProperty("authorization", apiKey);
            semantiCa.connect();

            int status = semantiCa.getResponseCode();
            if (status != 200) {
                return null;
            }

            StringBuilder content = new StringBuilder();
            // Decode explicitly as UTF-8 rather than the platform default charset;
            // try-with-resources closes the reader even if reading fails.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(semantiCa.getInputStream(), "UTF-8"))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    content.append(inputLine);
                }
            }

            JsonParser jp = new JsonParser();
            JsonElement root = jp.parse(content.toString());
            return root.getAsJsonObject();
        } finally {
            // Release the connection on every path, including non-200 responses and errors.
            semantiCa.disconnect();
        }
    }

    /**
     * Example entry point: extracts data from {@code URL_TO_SCRAPE}.
     *
     * @param args unused
     * @throws IOException if the API call fails
     */
    public static void main(String[] args) throws IOException {
        SemantiCaBinding semantiCaBinding = new SemantiCaBinding();
        JsonObject data = semantiCaBinding.callSemantiCaAPI("URL_TO_SCRAPE");
    }
}

Make sure to replace URL_TO_SCRAPE with the actual URL of a Web article you want to scrape data from and YOUR_API_KEY with your API key.

semanti.ca uses API keys to allow access to the API. You can subscribe to get a new semanti.ca API key at our developer portal.

semanti.ca expects the API key to be included in every API request to the server, in a header that looks like the following:

authorization: YOUR_API_KEY

The above command returns JSON structured like this:

{
  "data": {
    "attributes": {
      "author": [
        "Yonghui Wu"
      ],
      "date_published": "Wed, 16 May 2018 00:00:00 GMT",
      "extracted_keyphrase": [
        "smart compose (0.0554162780149)",
        "smart reply (0.0336758325376)",
        "language generation model (0.0263084414913)",
        "model training (0.0192964994525)",
        "hybrid model (0.0182371667555)",
        "model diagram (0.0173146395296)",
        "model complexity (0.0172079127314)",
        "bow model (0.0169959480998)",
        "model hyperparameters (0.0164424591386)"
      ],
      "extracted_topic": [
        "Google Brain",
        "Natural Language Processing",
        "Natural Language Understanding"
      ],
      "image": [
        [
          "https://2.bp.blogspot.com/-KlBuhzV_oFw/WvxP_OAkJ1I/AAAAAAAACu0/T0F6lFZl-2QpS0O7VBMhf8wkUPvnRaPIACLcBGAs/s640/image2.gif",
          "No caption."
        ],
        [
          "https://2.bp.blogspot.com/-ilOCekdQP0Y/WvxdAt6fPZI/AAAAAAAACvE/2_bZTVZt2D8iwSeiKx1rB2rpTVbr_v9KQCLcBGAs/s640/model3.png",
          "Smart Compose RNN-LM model architecture. Subject and previous email message are encoded by averaging the word embeddings in each field. The averaged embeddings are then fed to the RNN-LM at each decoding step."
        ]
      ],
      "predicted_topic": [
        "technology (0.48612168)"
      ],
      "title": "Smart Compose: Using Neural Networks to Help Write Emails"
    },
    "html": "Last week at <a href=\"https://events.google.com/io/\" semanti-ca-content-element=\"1\">Google I/O</a>, we introduced...",
    "text": "Last week at Google I/O , we introduced ..."
  },
  "status": "success"
}

HTTP Request

GET https://semanti-ca.cloud.tyk.io/extract-web-article

Query Parameters

Parameter Description
url A valid URL of a news article, online magazine article, or blog post.

Errors

The semanti.ca Web Article Data Extraction API uses the following error codes:

Error Code Meaning
401 Unauthorized -- Authorization field missing or API key is expired.
403 Forbidden -- Access to this API has been disallowed, API key is inactive, or quota exceeded.
404 Not Found -- The URL could not be found.
405 Method Not Allowed -- You tried to access the API with an invalid method.
429 Too Many Requests -- You're sending too many requests: slow down or subscribe to a higher plan.
500 Internal Server Error -- We had a problem with our server. Try again later.
503 Service Unavailable -- We're temporarily offline for maintenance. Please try again later.