From f259fa7f9f601431c9e2481d1bd546db46e945ef Mon Sep 17 00:00:00 2001
From: MarKoeh <75181140+Mar-Koeh@users.noreply.github.com>
Date: Mon, 10 Jan 2022 11:47:49 +0100
Subject: [PATCH] [ARDMediathekBridge] Switch to JSON-API (#2380)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Switch ARDMediathekBridge to JSON-API
The HTML screen-scraping approach of ARDMediathekBridge did not work reliably: I could not find a single show for which it produced a non-empty item list.
The proposed change uses the JSON-API of the WebApp. Although there is still room for improvement (feed title, better understanding of the API, more accurate mimicry of the webapp's behavior, de-pagination …), the bridge does work with this change.
Indicate that full URLs as well as bare IDs are now accepted.
---
bridges/ARDMediathekBridge.php | 79 ++++++++++++++++++++++++++++------
1 file changed, 67 insertions(+), 12 deletions(-)
diff --git a/bridges/ARDMediathekBridge.php b/bridges/ARDMediathekBridge.php
index 5ac8a41d..e3c9967a 100644
--- a/bridges/ARDMediathekBridge.php
+++ b/bridges/ARDMediathekBridge.php
@@ -4,14 +4,48 @@ class ARDMediathekBridge extends BridgeAbstract {
const URI = 'https://www.ardmediathek.de';
const DESCRIPTION = 'Feed of any series in the ARD-Mediathek, specified by its path';
const MAINTAINER = 'yue-dongchen';
+ /*
+ * Number of Items to be requested from ARDmediathek API
+ * 12 has been observed in the wild
+ * 29 is the highest successfully tested value
+ * More Items could be fetched via pagination
+ * The JSON-field pagination holds more information on that
+ * @const PAGESIZE number of requested items
+ */
+ const PAGESIZE = 29;
+ /*
+ * The URL Prefix of the (Webapp-)API
+ * @const APIENDPOINT https-URL of the used endpoint
+ */
+ const APIENDPOINT = 'https://api.ardmediathek.de/page-gateway/widgets/ard/asset/';
+ /*
+ * The URL prefix of the video link
+ * URLs from the webapp include a slug containing titles of show, episode, and tv station.
+ * It seems to work without that.
+ * @const VIDEOLINKPREFIX https-URL prefix of video links
+ */
+ const VIDEOLINKPREFIX = 'https://www.ardmediathek.de/video/';
+ /*
+ * The requested width of the preview image
+ * 432 has been observed in the wild
+ * The webapp seems to also compute and add the height value
+ * It seems to work without that.
+ * @const IMAGEWIDTH width in px of the preview image
+ */
+ const IMAGEWIDTH = 432;
+ /*
+ * Placeholder that will be replaced by IMAGEWIDTH in the preview image URL
+ * @const IMAGEWIDTHPLACEHOLDER
+ */
+ const IMAGEWIDTHPLACEHOLDER = '{width}';
const PARAMETERS = array(
array(
'path' => array(
- 'name' => 'Path',
+ 'name' => 'Show Link or ID',
'required' => true,
- 'title' => 'Enter without trailing slash',
- 'defaultValue' => '45-min/Y3JpZDovL25kci5kZS8xMzkx'
+ 'title' => 'Link to the show page or just its alphanumeric suffix',
+ 'defaultValue' => 'https://www.ardmediathek.de/sendung/45-min/Y3JpZDovL25kci5kZS8xMzkx/'
)
)
);
@@ -19,17 +53,38 @@ class ARDMediathekBridge extends BridgeAbstract {
public function collectData() {
date_default_timezone_set('Europe/Berlin');
- $url = 'https://www.ardmediathek.de/sendung/' . $this->getInput('path') . '/';
- $html = getSimpleHTMLDOM($url);
- $html = defaultLinkTo($html, $url);
+ $pathComponents = explode('/', $this->getInput('path'));
+ if (empty($pathComponents)) {
+ returnClientError('Path may not be empty');
+ }
+ if (count($pathComponents) < 2) {
+ $showID = $pathComponents[0];
+ } else {
+ $lastKey = count($pathComponents) - 1;
+ $showID = $pathComponents[$lastKey];
+ if (strlen($showID) === 0) {
+ $showID = $pathComponents[$lastKey - 1];
+ }
+ }
- foreach($html->find('a.Root-sc-1ytw7qu-0') as $video) {
+ $url = SELF::APIENDPOINT . $showID . '/?pageSize=' . SELF::PAGESIZE;
+ $rawJSON = getContents($url);
+ $processedJSON = json_decode($rawJSON);
+
+ foreach($processedJSON->teasers as $video) {
$item = array();
- $item['uri'] = $video->href;
- $item['title'] = $video->find('h3', 0)->plaintext;
- $item['content'] = '';
- $item['timestamp'] = strtotime(mb_substr($video->find('div.Line-epbftj-1', 0)->plaintext, 0, 10));
-
+ // there is also ->links->self->id, ->links->self->urlId, ->links->target->id, ->links->target->urlId
+ $item['uri'] = SELF::VIDEOLINKPREFIX . $video->id . '/';
+ // there is also ->mediumTitle and ->shortTitle
+ $item['title'] = $video->longTitle;
+ // in the test, aspect16x9 was the only child of images, not sure whether that is always true
+ $item['enclosures'] = array(
+ str_replace(SELF::IMAGEWIDTHPLACEHOLDER, SELF::IMAGEWIDTH, $video->images->aspect16x9->src)
+ );
+ $item['content'] = '
'; + $item['timestamp'] = $video->broadcastedOn; + $item['uid'] = $video->id; + $item['author'] = $video->publicationService->name; $this->items[] = $item; } }