From 4d6046399456dd4ad826511144cc4eec5ec15694 Mon Sep 17 00:00:00 2001 From: "Eric Chen (from Dev Box)" Date: Fri, 6 Mar 2026 04:05:50 -0600 Subject: [PATCH 01/32] Client-side migration: replace server APIs with local alternatives Replace two server-side OneNote APIs with client-side implementations: 1. Article Extraction: Replace augmentation API with @mozilla/readability - Local Readability.js parsing instead of server POST - FullPage as default clip mode (no domain whitelist) 2. Full Page Screenshot: Replace DomEnhancer API with renderer window - Scroll-capture via captureVisibleTab in focused popup window - Canvas stitching with DPR-aware overlap detection - Binary MIME part upload (no base64 overhead) - URL rewriting for images, stylesheets, srcset, CSS url() - Fixed/sticky position neutralization - Mode-switch cancel/retry mechanism - Session storage cleanup Known limitations: - External CSS not inlined (next priority) - Renderer window must stay visible (captureVisibleTab) - Canvas height capped at 16384px, storage quota 10MB - Test infrastructure uses deprecated PhantomJS Co-Authored-By: Claude Opus 4.6 (1M context) --- THIRD-PARTY-NOTICES.txt | 18 ++ docs/client-side-migration.md | 225 ++++++++++++++++++ gulpfile.js | 28 ++- package-lock.json | 29 ++- package.json | 1 + src/renderer.html | 18 ++ src/scripts/clipperUI/clipper.tsx | 21 +- .../previewViewer/fullPagePreview.tsx | 17 +- src/scripts/constants.ts | 4 +- .../contentCapture/augmentationHelper.ts | 117 ++++----- .../fullPageScreenshotHelper.ts | 204 ++++++++++++---- src/scripts/contentCapture/readability.d.ts | 30 +++ .../extensions/bookmarklet/inlineWorker.ts | 4 + src/scripts/extensions/chrome/manifest.json | 3 +- src/scripts/extensions/edge/manifest.json | 3 +- src/scripts/extensions/extensionWorkerBase.ts | 21 ++ src/scripts/extensions/safari/safariWorker.ts | 8 + .../webExtensionBase/webExtensionWorker.ts | 166 +++++++++++++ src/scripts/renderer.ts | 211 ++++++++++++++++ 
.../saveToOneNote/oneNoteSaveableFactory.ts | 13 +- .../previewViewer/fullPagePreview_tests.tsx | 12 +- .../augmentationHelper_tests.ts | 130 +++------- .../oneNoteSaveableFactory_tests.ts | 2 +- 23 files changed, 1038 insertions(+), 247 deletions(-) create mode 100644 docs/client-side-migration.md create mode 100644 src/renderer.html create mode 100644 src/scripts/contentCapture/readability.d.ts create mode 100644 src/scripts/renderer.ts diff --git a/THIRD-PARTY-NOTICES.txt b/THIRD-PARTY-NOTICES.txt index 357754f5..46fe340b 100644 --- a/THIRD-PARTY-NOTICES.txt +++ b/THIRD-PARTY-NOTICES.txt @@ -303,3 +303,21 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +------------------------------------------- + +@mozilla/readability + +Copyright (c) 2010 Arc90 Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/docs/client-side-migration.md b/docs/client-side-migration.md new file mode 100644 index 00000000..4ebb0a6e --- /dev/null +++ b/docs/client-side-migration.md @@ -0,0 +1,225 @@ +# WebClipper Client-Side Migration + +## Overview + +This document tracks the experiment to remove server-side dependencies from the OneNote Web Clipper's content processing pipeline and replace them with client-side alternatives. 
The goal is a fully self-contained browser extension that does not rely on the OneNote augmentation/screenshot server APIs. + +## Server APIs Removed + +### 1. Augmentation API +- **Endpoint:** `onenote.com/onaugmentation/clipperextract/v1.0/` +- **Purpose:** Server-side article/recipe/product extraction using ML models +- **Replacement:** Mozilla Readability (`@mozilla/readability`, Apache 2.0 license) +- **Status:** Complete + +### 2. Full Page Screenshot API (DomEnhancer) +- **Endpoint:** `onenote.com/onaugmentation/clipperDomEnhancer/v1.0/` +- **Purpose:** Server-side Puppeteer rendering of page DOM into full-page screenshots +- **Replacement:** Client-side renderer window with scroll-capture and canvas stitching +- **Status:** Functional, with known issues (see below) + +--- + +## Change 1: Article Extraction with Readability.js + +### What Changed +- `augmentationHelper.ts` — Rewrote `augmentPage()` to use `new Readability(doc).parse()` locally instead of POSTing to the server API +- Removed `makeAugmentationRequest()` method entirely +- Removed imports: `HttpWithRetries`, `OneNoteApiUtils`, `Settings`, `Constants` (URL refs) +- Added metadata mapping: Readability's `title`, `excerpt`, `byline`, `siteName`, `publishedTime` are stored in `PageMetadata` + +### Why Readability.js +- Apache 2.0 license (compatible with WebClipper's MIT license; repo already has Apache 2.0 deps like pdfjs-dist) +- Well-maintained by Mozilla, used in Firefox Reader View +- Produces clean article HTML similar to what the server API returned + +### Other Related Changes +- `clipper.tsx` — Removed `UrlUtils.onWhitelistedDomain()` check that gated augmentation mode; FullPage is now the default clip mode +- `constants.ts` — Removed `augmentationApiUrl` constant +- `readability.d.ts` (new) — TypeScript type declarations for `@mozilla/readability` +- `package.json` — Added `@mozilla/readability` dependency +- `augmentationHelper_tests.ts` — Updated tests for new local implementation + 
+--- + +## Change 2: Full Page Screenshot with Renderer Window + +### Architecture + +The server-side approach used Puppeteer to render sanitized HTML and produce a full-page screenshot. The client-side replacement mirrors this: + +1. **Store HTML in `chrome.storage.session`** — The page's HTML content, base URL, and localized status text are written to session storage (avoids JSON serialization bottleneck with large payloads) +2. **Open a renderer popup window** — An extension page (`renderer.html`) is opened at the same position/size as the user's browser with `focused: true`. Width is capped at 1280px. Zoom is forced to 100% via `chrome.tabs.setZoom`. Title bar shows localized "Clipping Page" status text +3. **Port-based communication** — The renderer page connects to the service worker via `chrome.runtime.connect({ name: "renderer" })`. Commands (loadContent, scroll) are exchanged over this port +4. **Renderer loads content** — Reads HTML from `chrome.storage.session`, strips ` + + diff --git a/src/scripts/clipperUI/clipper.tsx b/src/scripts/clipperUI/clipper.tsx index 57511271..320279be 100644 --- a/src/scripts/clipperUI/clipper.tsx +++ b/src/scripts/clipperUI/clipper.tsx @@ -8,7 +8,6 @@ import {PageInfo} from "../pageInfo"; import {Polyfills} from "../polyfills"; import {PreviewGlobalInfo} from "../previewInfo"; import {TooltipType} from "./tooltipType"; -import {UrlUtils} from "../urlUtils"; import {Communicator} from "../communicator/communicator"; import {IFrameMessageHandler} from "../communicator/iframeMessageHandler"; @@ -232,6 +231,9 @@ class ClipperClass extends ComponentBase { } private captureFullPageScreenshotContent() { + if (this.state.fullPageResult && this.state.fullPageResult.status === Status.InProgress) { + return; + } if (this.state.pageInfo.contentType === OneNoteApi.ContentType.EnhancedUrl) { this.state.setState({ fullPageResult: { @@ -244,7 +246,7 @@ class ClipperClass extends ComponentBase { } else { this.state.setState({ 
fullPageResult: { status: Status.InProgress } }); - FullPageScreenshotHelper.getFullPageScreenshot(this.state.pageInfo.contentData).then((result) => { + FullPageScreenshotHelper.getFullPageScreenshot(this.state.pageInfo.contentData, this.state.pageInfo.rawUrl).then((result) => { this.state.setState({ fullPageResult: { data: result, status: Status.Succeeded } }); }, () => { this.state.setState({ @@ -532,8 +534,20 @@ class ClipperClass extends ComponentBase { private initializeSmartValues() { this.state.currentMode.subscribe((newMode: ClipMode) => { + if (newMode !== ClipMode.FullPage && this.state.fullPageResult && this.state.fullPageResult.status === Status.InProgress) { + // Cancel in-progress screenshot when switching away from FullPage + Clipper.getExtensionCommunicator().callRemoteFunction(Constants.FunctionKeys.cancelFullPageScreenshot); + this.state.setState({ fullPageResult: { status: Status.Failed } }); + } + switch (newMode) { case ClipMode.FullPage: + Clipper.getInjectCommunicator().callRemoteFunction(Constants.FunctionKeys.updatePageInfoIfUrlChanged); + // Retry screenshot if previous attempt failed + if (!this.state.fullPageResult || this.state.fullPageResult.status === Status.Failed || this.state.fullPageResult.status === Status.NotStarted) { + this.captureFullPageScreenshotContent(); + } + break; case ClipMode.Augmentation: Clipper.getInjectCommunicator().callRemoteFunction(Constants.FunctionKeys.updatePageInfoIfUrlChanged); break; @@ -571,9 +585,6 @@ class ClipperClass extends ComponentBase { return ClipMode.Pdf; } - if (UrlUtils.onWhitelistedDomain(this.state.pageInfo.rawUrl)) { - return ClipMode.Augmentation; - } } return ClipMode.FullPage; diff --git a/src/scripts/clipperUI/components/previewViewer/fullPagePreview.tsx b/src/scripts/clipperUI/components/previewViewer/fullPagePreview.tsx index cf56ea8e..2fd5365b 100644 --- a/src/scripts/clipperUI/components/previewViewer/fullPagePreview.tsx +++ 
b/src/scripts/clipperUI/components/previewViewer/fullPagePreview.tsx @@ -1,9 +1,6 @@ import {Constants} from "../../../constants"; -import {SmartValue} from "../../../communicator/smartValue"; - import {FullPageScreenshotResult} from "../../../contentCapture/fullPageScreenshotHelper"; -import {PdfScreenshotResult} from "../../../contentCapture/pdfScreenshotHelper"; import {ExtensionUtils} from "../../../extensions/extensionUtils"; @@ -18,6 +15,8 @@ import {PreviewComponentBase} from "./previewComponentBase"; import {PreviewViewerFullPageHeader} from "./previewViewerFullPageHeader"; class FullPagePreview extends PreviewComponentBase<{}, ClipperStateProp> { + private currentObjectUrl: string = ""; + protected getContentBodyForCurrentStatus(): any[] { let state = this.props.clipperState; @@ -59,7 +58,8 @@ class FullPagePreview extends PreviewComponentBase<{}, ClipperStateProp> { return Localization.getLocalizedString("WebClipper.Preview.LoadingMessage"); default: case Status.Failed: - failureMessage = this.props.clipperState.fullPageResult.data.failureMessage; + let resultData = this.props.clipperState.fullPageResult.data; + failureMessage = resultData ? resultData.failureMessage : undefined; return !!failureMessage ? 
failureMessage : noContentFoundString; } } @@ -74,9 +74,12 @@ class FullPagePreview extends PreviewComponentBase<{}, ClipperStateProp> { if (this.props.clipperState.fullPageResult.data) { let screenshotImages: FullPageScreenshotResult = this.props.clipperState.fullPageResult.data; - for (let imageData of screenshotImages.Images) { - let dataUrl = "data:image/" + screenshotImages.ImageFormat + ";" + screenshotImages.ImageEncoding + "," + imageData; - contentBody.push({altTag}); + if (screenshotImages.ImageBlob) { + if (this.currentObjectUrl) { + URL.revokeObjectURL(this.currentObjectUrl); + } + this.currentObjectUrl = URL.createObjectURL(screenshotImages.ImageBlob); + contentBody.push({altTag}); } } break; diff --git a/src/scripts/constants.ts b/src/scripts/constants.ts index c862082e..e2b43aa1 100644 --- a/src/scripts/constants.ts +++ b/src/scripts/constants.ts @@ -330,6 +330,8 @@ export module Constants { export var signOutUser = "SIGN_OUT_USER"; export var tabToLowestIndexedElement = "TAB_TO_LOWEST_INDEXED_ELEMENT"; export var takeTabScreenshot = "TAKE_TAB_SCREENSHOT"; + export var takeFullPageScreenshot = "TAKE_FULL_PAGE_SCREENSHOT"; + export var cancelFullPageScreenshot = "CANCEL_FULL_PAGE_SCREENSHOT"; export var telemetry = "TELEMETRY"; export var toggleClipper = "TOGGLE_CLIPPER"; export var unloadHandler = "UNLOAD_HANDLER"; @@ -382,9 +384,7 @@ export module Constants { export module Urls { export var serviceDomain = "https://www.onenote.com"; - export var augmentationApiUrl = serviceDomain + "/onaugmentation/clipperextract/v1.0/"; export var changelogUrl = serviceDomain + "/whatsnext/webclipper"; - export var fullPageScreenshotUrl = serviceDomain + "/onaugmentation/clipperDomEnhancer/v1.0/"; export var localizedStringsUrlBase = serviceDomain + "/strings?ids=WebClipper."; export var clipperInstallPageUrl = "https://support.microsoft.com/en-us/office/getting-started-with-the-onenote-web-clipper-5696609d-c5ae-4591-b3af-1f897cb6eda6"; diff --git 
a/src/scripts/contentCapture/augmentationHelper.ts b/src/scripts/contentCapture/augmentationHelper.ts index b5c7ee58..52d21fb0 100644 --- a/src/scripts/contentCapture/augmentationHelper.ts +++ b/src/scripts/contentCapture/augmentationHelper.ts @@ -1,20 +1,16 @@ -import {Constants} from "../constants"; -import {Settings} from "../settings"; import {StringUtils} from "../stringUtils"; import {ObjectUtils} from "../objectUtils"; import {Clipper} from "../clipperUI/frontEndGlobals"; import {ClipperState} from "../clipperUI/clipperState"; -import {OneNoteApiUtils} from "../clipperUI/oneNoteApiUtils"; import {DomUtils, EmbeddedVideoIFrameSrcs} from "../domParsers/domUtils"; -import {HttpWithRetries} from "../http/httpWithRetries"; - import * as Log from "../logging/log"; import {CaptureFailureInfo} from "./captureFailureInfo"; -import { ErrorUtils, ResponsePackage } from "../responsePackage"; + +import {Readability} from "@mozilla/readability"; export enum AugmentationModel { None, @@ -38,43 +34,65 @@ export class AugmentationHelper { public static augmentPage(url: string, locale: string, pageContent: string): Promise { return new Promise((resolve, reject) => { let augmentationEvent = new Log.Event.PromiseEvent(Log.Event.Label.AugmentationApiCall); - - let correlationId = StringUtils.generateGuid(); - augmentationEvent.setCustomProperty(Log.PropertyName.Custom.CorrelationId, correlationId); - - AugmentationHelper.makeAugmentationRequest(url, locale, pageContent, correlationId).then((responsePackage: { parsedResponse: AugmentationResult[], response: Response }) => { - let parsedResponse = responsePackage.parsedResponse; - let result: AugmentationResult = { ContentModel: AugmentationModel.None, ContentObjects: [] }; - - augmentationEvent.setCustomProperty(Log.PropertyName.Custom.CorrelationId, responsePackage.response.headers.get(Constants.HeaderValues.correlationId)); - - if (parsedResponse && parsedResponse.length > 0 && parsedResponse[0].ContentInHtml) { - result = 
parsedResponse[0]; + augmentationEvent.setCustomProperty(Log.PropertyName.Custom.CorrelationId, StringUtils.generateGuid()); + + try { + let result: AugmentationResult = { ContentModel: AugmentationModel.None, ContentObjects: [] }; + + // Parse the page HTML into a Document for Readability + let doc = (new DOMParser()).parseFromString(pageContent, "text/html"); + + // Clone the document because Readability mutates it + let docClone = doc.cloneNode(true) as Document; + + let reader = new Readability(docClone, { charThreshold: 100 }); + let article = reader.parse(); + + if (article && article.content) { + result.ContentInHtml = article.content; + result.ContentModel = AugmentationModel.Article; + result.ContentObjects = []; + + let metadata: { [key: string]: string } = {}; + if (article.title) { + metadata.title = article.title; + } + if (article.excerpt) { + metadata.description = article.excerpt; + } + if (article.byline) { + metadata.author = article.byline; + } + if (article.siteName) { + metadata.siteName = article.siteName; + } + if (article.publishedTime) { + metadata.publishedTime = article.publishedTime; + } + result.PageMetadata = metadata; augmentationEvent.setCustomProperty(Log.PropertyName.Custom.AugmentationModel, AugmentationModel[result.ContentModel]); // Remove tags that are unsupported by ONML before we display them in the preview - // Supported tags: https://msdn.microsoft.com/en-us/library/office/dn575442.aspx - let doc = (new DOMParser()).parseFromString(result.ContentInHtml, "text/html"); - let previewElement = AugmentationHelper.getArticlePreviewElement(doc); + let contentDoc = (new DOMParser()).parseFromString(result.ContentInHtml, "text/html"); + let previewElement = AugmentationHelper.getArticlePreviewElement(contentDoc); - DomUtils.toOnml(doc).then(async () => { + DomUtils.toOnml(contentDoc).then(async () => { DomUtils.addPreviewContainerStyling(previewElement); await AugmentationHelper.addSupportedVideosToElement(previewElement, 
pageContent, url); - result.ContentInHtml = doc.body.innerHTML; + result.ContentInHtml = contentDoc.body.innerHTML; resolve(result); }); } else { resolve(result); } - - augmentationEvent.setCustomProperty(Log.PropertyName.Custom.AugmentationModel, AugmentationModel[result.ContentModel]); - }).catch((failure: OneNoteApi.RequestError) => { - OneNoteApiUtils.logOneNoteApiRequestError(augmentationEvent, failure); + } catch (e) { + augmentationEvent.setStatus(Log.Status.Failed); + augmentationEvent.setFailureInfo({ error: e.message || "Readability parsing failed" }); reject(); - }).then(() => { - Clipper.logger.logEvent(augmentationEvent); - }); + } + + Clipper.logger.logEvent(augmentationEvent); }); } @@ -86,8 +104,6 @@ export class AugmentationHelper { return augmentationType; } - // TODO: There is a work-item to change the AugmentationApi to return ContentModel as a StringUtils - // instead of an integer let contentModel: AugmentationModel = state.augmentationResult.data.ContentModel; if (AugmentationHelper.isSupportedAugmentationType(contentModel)) { @@ -97,43 +113,6 @@ export class AugmentationHelper { return augmentationType; } - /* - * Returns the augmented preview text. 
- */ - public static makeAugmentationRequest(url: string, locale: string, pageContent: string, requestCorrelationId: string): Promise> { - return new Promise>((resolve, reject) => { - Clipper.getUserSessionIdWhenDefined().then((sessionId) => { - let augmentationApiUrl = Constants.Urls.augmentationApiUrl + "?renderMethod=extractAggressive&url=" + url + "&lang=" + locale; - - let headers = {}; - headers[Constants.HeaderValues.appIdKey] = Settings.getSetting("App_Id"); - headers[Constants.HeaderValues.noAuthKey] = "true"; - headers[Constants.HeaderValues.correlationId] = requestCorrelationId; - headers[Constants.HeaderValues.userSessionIdKey] = sessionId; - - HttpWithRetries.post(augmentationApiUrl, pageContent, headers).then((response: Response) => { - response.text().then((responseText: string) => { - let parsedResponse: any; - try { - parsedResponse = JSON.parse(responseText); - } catch (e) { - Clipper.logger.logJsonParseUnexpected(responseText); - ErrorUtils.createRequestErrorObject(response, OneNoteApi.RequestErrorType.UNABLE_TO_PARSE_RESPONSE).then((error) => { - reject(error); - }); - } - - let responsePackage = { - parsedResponse: parsedResponse, - response: response - }; - resolve(responsePackage); - }); - }); - }); - }); - } - public static getArticlePreviewElement(doc: Document): HTMLElement { let mainContainers = doc.getElementsByClassName("MainArticleContainer"); if (ObjectUtils.isNullOrUndefined(mainContainers) || ObjectUtils.isNullOrUndefined(mainContainers[0])) { diff --git a/src/scripts/contentCapture/fullPageScreenshotHelper.ts b/src/scripts/contentCapture/fullPageScreenshotHelper.ts index 505e65f4..db1bc1b6 100644 --- a/src/scripts/contentCapture/fullPageScreenshotHelper.ts +++ b/src/scripts/contentCapture/fullPageScreenshotHelper.ts @@ -1,69 +1,179 @@ import {Clipper} from "../clipperUI/frontEndGlobals"; -import {OneNoteApiUtils} from "../clipperUI/oneNoteApiUtils"; - -import {HttpWithRetries} from "../http/httpWithRetries"; import * as Log from 
"../logging/log"; import {Constants} from "../constants"; -import {Settings} from "../settings"; +import {Localization} from "../localization/localization"; import {StringUtils} from "../stringUtils"; import {CaptureFailureInfo} from "./captureFailureInfo"; -import { ErrorUtils } from "../responsePackage"; export interface FullPageScreenshotResult extends CaptureFailureInfo { - ImageEncoding?: string; ImageFormat?: string; - Images?: string[]; + ImageBlob?: Blob; + ImageWidth?: number; } -export class FullPageScreenshotHelper { - private static timeout = 50000; +interface ScrollData { + scrollPositions: number[]; + viewportHeight: number; +} - public static getFullPageScreenshot(pageInfoContentData: string): Promise { +export class FullPageScreenshotHelper { + public static getFullPageScreenshot(pageInfoContentData: string, pageUrl?: string): Promise { return new Promise((resolve, reject) => { - Clipper.getUserSessionIdWhenDefined().then((sessionId) => { - let fullPageScreenshotEvent = new Log.Event.PromiseEvent(Log.Event.Label.FullPageScreenshotCall); - - let correlationId = StringUtils.generateGuid(); - fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.CorrelationId, correlationId); - - let headers = {}; - headers[Constants.HeaderValues.accept] = "application/json"; - headers[Constants.HeaderValues.appIdKey] = Settings.getSetting("App_Id"); - headers[Constants.HeaderValues.noAuthKey] = "true"; - headers[Constants.HeaderValues.correlationId] = correlationId; - headers[Constants.HeaderValues.userSessionIdKey] = sessionId; - - let errorCallback = (error: OneNoteApi.RequestError) => { - fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.CorrelationId, error.responseHeaders[Constants.HeaderValues.correlationId]); - OneNoteApiUtils.logOneNoteApiRequestError(fullPageScreenshotEvent, error); - }; + let fullPageScreenshotEvent = new Log.Event.PromiseEvent(Log.Event.Label.FullPageScreenshotCall); + let correlationId = 
StringUtils.generateGuid(); + fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.CorrelationId, correlationId); - HttpWithRetries.post(Constants.Urls.fullPageScreenshotUrl, pageInfoContentData, headers, [200, 204], FullPageScreenshotHelper.timeout).then((response: Response) => { - if (response.status === 200) { - response.text().then((responseText: string) => { - try { - resolve(JSON.parse(responseText) as FullPageScreenshotResult); - fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.FullPageScreenshotContentFound, true); - } catch (e) { - ErrorUtils.createRequestErrorObject(response, OneNoteApi.RequestErrorType.UNABLE_TO_PARSE_RESPONSE, FullPageScreenshotHelper.timeout).then((error) => { - reject(error); - }); + let statusText = Localization.getLocalizedString("WebClipper.ClipType.ScreenShot.ProgressLabel") || "Capturing page..."; + let storageData: any = { fullPageHtmlContent: pageInfoContentData, fullPageStatusText: statusText }; + if (pageUrl) { + storageData.fullPageBaseUrl = pageUrl; + } + + chrome.storage.session.set(storageData, () => { + Clipper.getExtensionCommunicator().callRemoteFunction( + Constants.FunctionKeys.takeFullPageScreenshot, { + callback: (signal: any) => { + if (!signal || !signal.success) { + fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.FullPageScreenshotContentFound, false); + Clipper.logger.logEvent(fullPageScreenshotEvent); + reject(); + return; } - }); - } else { - fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.FullPageScreenshotContentFound, false); - reject(); + + chrome.storage.session.get(["fullPageScreenshots", "fullPageScrollData"], (stored: any) => { + let dataUrls: string[] = stored && stored.fullPageScreenshots ? stored.fullPageScreenshots : []; + let scrollData: ScrollData = stored && stored.fullPageScrollData ? 
stored.fullPageScrollData : undefined; + + chrome.storage.session.remove([ + "fullPageHtmlContent", "fullPageBaseUrl", "fullPageStatusText", + "fullPageScreenshots", "fullPageScrollData" + ]); + + if (dataUrls.length > 0) { + FullPageScreenshotHelper.stitchImages(dataUrls, scrollData).then((imageBlob) => { + let result: FullPageScreenshotResult = { + ImageFormat: signal.format || "jpeg", + ImageBlob: imageBlob, + ImageWidth: signal.cssWidth + }; + + fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.FullPageScreenshotContentFound, true); + Clipper.logger.logEvent(fullPageScreenshotEvent); + resolve(result); + }, () => { + fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.FullPageScreenshotContentFound, false); + Clipper.logger.logEvent(fullPageScreenshotEvent); + reject(); + }); + } else { + fullPageScreenshotEvent.setCustomProperty(Log.PropertyName.Custom.FullPageScreenshotContentFound, false); + Clipper.logger.logEvent(fullPageScreenshotEvent); + reject(); + } + }); + } } - }, (error: OneNoteApi.RequestError) => { - errorCallback(error); - reject(); - }).then(() => { - Clipper.logger.logEvent(fullPageScreenshotEvent); - }); + ); }); }); } + + /** + * Stitches multiple viewport screenshots into a single image, cropping overlaps. + * Uses scroll position data to detect where captures overlap (last capture is + * typically clamped by the browser, causing duplication with the previous one). + */ + private static stitchImages(dataUrls: string[], scrollData: ScrollData): Promise { + return new Promise((resolve, reject) => { + let images: HTMLImageElement[] = []; + let loaded = 0; + + let onAllLoaded = () => { + let totalWidth = images[0].naturalWidth; + let imgHeight = images[0].naturalHeight; + + // Device pixel ratio: captured images may be larger than CSS viewport + let dpr = scrollData ? 
imgHeight / scrollData.viewportHeight : 1; + + // Calculate the visible (non-overlapping) portion of each capture + let slices: { img: HTMLImageElement; srcY: number; height: number }[] = []; + + if (scrollData && scrollData.scrollPositions.length === images.length) { + let positions = scrollData.scrollPositions; + + for (let i = 0; i < images.length; i++) { + if (i === 0) { + // First capture: full image + slices.push({ img: images[i], srcY: 0, height: imgHeight }); + } else { + // Calculate overlap in CSS pixels, then scale to image pixels + let expectedScroll = i * scrollData.viewportHeight; + let actualScroll = positions[i]; + let overlapCss = expectedScroll - actualScroll; + let overlapPx = Math.round(overlapCss * dpr); + + if (overlapPx > 0 && overlapPx < imgHeight) { + // Crop the overlapping top portion + slices.push({ img: images[i], srcY: overlapPx, height: imgHeight - overlapPx }); + } else { + slices.push({ img: images[i], srcY: 0, height: imgHeight }); + } + } + } + } else { + // No scroll data — stitch naively + for (let i = 0; i < images.length; i++) { + slices.push({ img: images[i], srcY: 0, height: imgHeight }); + } + } + + // Calculate total stitched height, capped at 16384px (canvas/API limit) + let maxCanvasHeight = 16384; + let totalHeight = 0; + for (let i = 0; i < slices.length; i++) { + if (totalHeight + slices[i].height > maxCanvasHeight) { + slices[i].height = maxCanvasHeight - totalHeight; + totalHeight = maxCanvasHeight; + slices.length = i + 1; + break; + } + totalHeight += slices[i].height; + } + + let canvas = document.createElement("canvas"); + canvas.width = totalWidth; + canvas.height = totalHeight; + let ctx = canvas.getContext("2d") as CanvasRenderingContext2D; + + let yOffset = 0; + for (let i = 0; i < slices.length; i++) { + let s = slices[i]; + ctx.drawImage(s.img, 0, s.srcY, totalWidth, s.height, 0, yOffset, totalWidth, s.height); + yOffset += s.height; + } + + canvas.toBlob(((blob: Blob) => { + resolve(blob); + }) as 
BlobCallback, "image/jpeg", 0.9); + }; + + for (let i = 0; i < dataUrls.length; i++) { + let img = new Image(); + img.onload = () => { + loaded++; + if (loaded === dataUrls.length) { + onAllLoaded(); + } + }; + img.onerror = () => { + reject(); + }; + images.push(img); + img.src = dataUrls[i]; + } + }); + } } diff --git a/src/scripts/contentCapture/readability.d.ts b/src/scripts/contentCapture/readability.d.ts new file mode 100644 index 00000000..54a6a896 --- /dev/null +++ b/src/scripts/contentCapture/readability.d.ts @@ -0,0 +1,30 @@ +declare module "@mozilla/readability" { + export class Readability { + constructor(doc: Document, options?: { + debug?: boolean; + maxElemsToParse?: number; + nbTopCandidates?: number; + charThreshold?: number; + classesToPreserve?: string[]; + keepClasses?: boolean; + }); + parse(): { + title: string; + content: string; + textContent: string; + length: number; + excerpt: string; + byline: string; + dir: string; + siteName: string; + lang: string; + publishedTime: string; + } | null; + } + + export function isProbablyReaderable(doc: Document, options?: { + minContentLength?: number; + minScore?: number; + visibilityChecker?: (node: Element) => boolean; + }): boolean; +} diff --git a/src/scripts/extensions/bookmarklet/inlineWorker.ts b/src/scripts/extensions/bookmarklet/inlineWorker.ts index df21aefe..0089b3c0 100644 --- a/src/scripts/extensions/bookmarklet/inlineWorker.ts +++ b/src/scripts/extensions/bookmarklet/inlineWorker.ts @@ -77,6 +77,10 @@ export class InlineWorker extends ExtensionWorkerBase { return this.throwNotImplementedFailure(); } + protected takeFullPageScreenshot(htmlContent: string): Promise { + return this.throwNotImplementedFailure(); + } + /** * Launches the sign in window, rejecting with an error object if something went wrong on the server during * authentication. 
Otherwise, it resolves with true if the redirect endpoint was hit as a result of a successful diff --git a/src/scripts/extensions/chrome/manifest.json b/src/scripts/extensions/chrome/manifest.json index 64a48fe0..b1d4168d 100644 --- a/src/scripts/extensions/chrome/manifest.json +++ b/src/scripts/extensions/chrome/manifest.json @@ -41,7 +41,8 @@ "tabs", "webRequest", "webNavigation", - "offscreen" + "offscreen", + "storage" ], "host_permissions": [ "" diff --git a/src/scripts/extensions/edge/manifest.json b/src/scripts/extensions/edge/manifest.json index 96123579..cd27b127 100644 --- a/src/scripts/extensions/edge/manifest.json +++ b/src/scripts/extensions/edge/manifest.json @@ -36,7 +36,8 @@ "tabs", "webRequest", "webNavigation", - "offscreen" + "offscreen", + "storage" ], "host_permissions": [ diff --git a/src/scripts/extensions/extensionWorkerBase.ts b/src/scripts/extensions/extensionWorkerBase.ts index 52cef741..9ec78321 100644 --- a/src/scripts/extensions/extensionWorkerBase.ts +++ b/src/scripts/extensions/extensionWorkerBase.ts @@ -192,6 +192,19 @@ export abstract class ExtensionWorkerBase { */ protected abstract takeTabScreenshot(): Promise; + /** + * Captures a full-page screenshot of the page HTML that the UI stores under "fullPageHtmlContent" in chrome.storage.session (the registered handler passes "" for htmlContent). + * Resolves with a status signal (the UI reads its success, format and cssWidth fields), not image data; the captured data URLs are read back from chrome.storage.session by the UI. + */ + protected abstract takeFullPageScreenshot(htmlContent: string): Promise; + + /** + * Cancels an in-progress full-page screenshot capture. + */ + protected cancelFullPageScreenshot(): void { + // Default no-op; overridden in WebExtensionWorker + } + /** * Closes all active frames and notifies the UI to invoke the clipper.
*/ @@ -584,6 +597,14 @@ export abstract class ExtensionWorkerBase { return this.takeTabScreenshot(); }); + this.uiCommunicator.registerFunction(Constants.FunctionKeys.takeFullPageScreenshot, () => { + return this.takeFullPageScreenshot(""); + }); + + this.uiCommunicator.registerFunction(Constants.FunctionKeys.cancelFullPageScreenshot, () => { + this.cancelFullPageScreenshot(); + }); + this.uiCommunicator.setErrorHandler((e: Error) => { Log.ErrorUtils.handleCommunicatorError(Constants.CommunicationChannels.extensionAndUi, e, this.clientInfo); }); diff --git a/src/scripts/extensions/safari/safariWorker.ts b/src/scripts/extensions/safari/safariWorker.ts index 4ef146aa..1e4c71a7 100644 --- a/src/scripts/extensions/safari/safariWorker.ts +++ b/src/scripts/extensions/safari/safariWorker.ts @@ -140,6 +140,14 @@ export class SafariWorker extends ExtensionWorkerBase { + return Promise.resolve({ success: false } as any); + } + /** * Launches a new tab and navigates to the given url. If autoCloseDestinationUrl is defined, then a * listener is set that will wait until the given URL is navigated to, the window is closed. 
diff --git a/src/scripts/extensions/webExtensionBase/webExtensionWorker.ts b/src/scripts/extensions/webExtensionBase/webExtensionWorker.ts index a6c09887..f4fb4e91 100644 --- a/src/scripts/extensions/webExtensionBase/webExtensionWorker.ts +++ b/src/scripts/extensions/webExtensionBase/webExtensionWorker.ts @@ -29,6 +29,7 @@ type Window = chrome.windows.Window; export class WebExtensionWorker extends ExtensionWorkerBase { private injectUrls: InjectUrls; private noOpTrackerInvoked: boolean; + private activeRendererCleanup: () => void; constructor(injectUrls: InjectUrls, tab: W3CTab, clientInfo: SmartValue, auth: AuthenticationHelper) { let messageHandlerThunk = () => { return new WebExtensionBackgroundMessageHandler(tab.id); }; @@ -39,6 +40,8 @@ export class WebExtensionWorker extends ExtensionWorkerBase { this.tabId = tab.id; this.noOpTrackerInvoked = false; + this.activeRendererCleanup = () => { /* no-op */ }; + let isPrivateWindow: Boolean = !!tab.incognito || !!tab.inPrivate; this.consoleOutputEnabledFlagProcessed.then(() => { @@ -203,6 +206,169 @@ export class WebExtensionWorker extends ExtensionWorkerBase { }); } + /** + * Cancels an in-progress full-page screenshot capture. + */ + protected cancelFullPageScreenshot(): void { + this.activeRendererCleanup(); + } + + /** + * Renders HTML in a temporary popup window and captures full-page screenshots. + * Mirrors the server-side DomEnhancer approach: opens an extension renderer page, + * communicates via chrome.runtime port, then scroll-captures it. 
+ */ + protected takeFullPageScreenshot(htmlContent: string): Promise { + let rendererUrl = WebExtension.browser.runtime.getURL("renderer.html"); + + return new Promise((resolve) => { + let dataUrls: string[] = []; + let renderWindowId: number; + let pendingPort: chrome.runtime.Port; + let windowReady = false; + + // Position renderer directly behind the user's window to hide it + WebExtension.browser.windows.getCurrent((currentWindow: chrome.windows.Window) => { + let renderWidth = Math.min(currentWindow ? currentWindow.width : 1280, 1280); + let renderHeight = currentWindow ? currentWindow.height : 768; + let renderLeft = currentWindow ? currentWindow.left : 0; + let renderTop = currentWindow ? currentWindow.top : 0; + + let startCapture = (port: chrome.runtime.Port) => { + let viewportHeight: number; + let captureCount = 0; + let scrollPositions: number[] = []; + + let cleaned = false; + let cleanup = () => { + if (cleaned) { return; } + cleaned = true; + this.activeRendererCleanup = () => { /* no-op */ }; + try { port.disconnect(); } catch (e) { /* ignore */ } + WebExtension.browser.windows.remove(renderWindowId); + chrome.storage.session.remove([ + "fullPageHtmlContent", "fullPageBaseUrl", "fullPageStatusText", + "fullPageScreenshots", "fullPageScrollData" + ]); + }; + this.activeRendererCleanup = cleanup; + + port.onMessage.addListener((message: any) => { + if (message.action === "ready") { + // Set zoom to 100% before loading content, then load + let rendererTabId: number; + try { + WebExtension.browser.tabs.query({ windowId: renderWindowId }, (tabs: chrome.tabs.Tab[]) => { + if (tabs && tabs.length > 0 && tabs[0].id) { + rendererTabId = tabs[0].id; + WebExtension.browser.tabs.setZoom(rendererTabId, 1, () => { + port.postMessage({ action: "loadContent" }); + }); + } else { + port.postMessage({ action: "loadContent" }); + } + }); + } catch (e) { + port.postMessage({ action: "loadContent" }); + } + } + + if (message.action === "dimensions") { + viewportHeight 
= message.viewportHeight; + + if (!viewportHeight) { + cleanup(); + resolve([]); + return; + } + + port.postMessage({ action: "scroll", scrollTo: 0 }); + } + + if (message.action === "scrollResult") { + setTimeout(() => { + WebExtension.browser.tabs.captureVisibleTab(renderWindowId, { format: "jpeg", quality: 95 }, (dataUrl: string) => { + if (!dataUrl) { + // Capture failed (window occluded/unfocused) — abort and fail + cleanup(); + resolve({ success: false } as any); + return; + } + + dataUrls.push(dataUrl); + scrollPositions.push(message.scrollY); + captureCount++; + + // Stop at bottom or when captures would exceed canvas height limit (16384px) + let maxCaptureHeight = 16384; + let atBottom = message.scrollY + viewportHeight >= message.pageHeight + || (captureCount * viewportHeight) >= maxCaptureHeight; + + if (atBottom) { + cleanup(); + chrome.storage.session.set({ + fullPageScreenshots: dataUrls, + fullPageScrollData: { + scrollPositions: scrollPositions, + viewportHeight: viewportHeight + } + }, () => { + resolve({ success: true, count: dataUrls.length, format: "jpeg", cssWidth: renderWidth } as any); + }); + } else { + port.postMessage({ action: "scroll", scrollTo: captureCount * viewportHeight }); + } + }); + }, 500); + } + }); + }; + + // Listen for the renderer page to connect via port + let onConnect = (port: chrome.runtime.Port) => { + if (port.name !== "renderer") { + return; + } + WebExtension.browser.runtime.onConnect.removeListener(onConnect); + + if (windowReady) { + startCapture(port); + } else { + // Window.create callback hasn't fired yet, defer + pendingPort = port; + } + }; + + WebExtension.browser.runtime.onConnect.addListener(onConnect); + + // Create the renderer window. Must be focused so Chrome paints it + // for captureVisibleTab. It auto-closes when capture completes. 
+ WebExtension.browser.windows.create({ + url: rendererUrl, + type: "popup", + width: renderWidth, + height: renderHeight, + left: renderLeft, + top: renderTop, + focused: true + }, (renderWindow: chrome.windows.Window) => { + if (!renderWindow) { + WebExtension.browser.runtime.onConnect.removeListener(onConnect); + resolve([]); + return; + } + renderWindowId = renderWindow.id; + windowReady = true; + + // If port connected before window.create callback, start now + if (pendingPort) { + startCapture(pendingPort); + } + }); + }); // end getCurrent + }); + } + private launchWebExtensionPopupAndWaitForClose(url: string, autoCloseDestinationUrl: string): Promise { return new Promise((resolve, reject) => { let popupWidth = 1000; diff --git a/src/scripts/renderer.ts b/src/scripts/renderer.ts new file mode 100644 index 00000000..ececf24b --- /dev/null +++ b/src/scripts/renderer.ts @@ -0,0 +1,211 @@ +// Renderer page script - connects to service worker via port +// and handles scroll/capture commands. Reads HTML directly from +// chrome.storage.session to avoid large data through message channels. +let port = chrome.runtime.connect({ name: "renderer" }); + +// Set title bar as status indicator — no overlay needed +chrome.storage.session.get(["fullPageStatusText"], (stored: any) => { + document.title = stored && stored.fullPageStatusText ? stored.fullPageStatusText : "Capturing page..."; +}); + +port.onMessage.addListener((message: any) => { + if (message.action === "loadContent") { + // Read HTML and base URL directly from session storage + chrome.storage.session.get(["fullPageHtmlContent", "fullPageBaseUrl"], (stored: any) => { + let rawHtml = stored && stored.fullPageHtmlContent ? stored.fullPageHtmlContent : ""; + let baseUrl = stored && stored.fullPageBaseUrl ? 
stored.fullPageBaseUrl : ""; + + // Strip scripts only — keep iframes (some sites use them for content) + let cleanHtml = rawHtml + .replace(//gi, "") + .replace(/]*\/>/gi, ""); + + // Parse into DOM to rewrite URLs and extract content + let parser = new DOMParser(); + let doc = parser.parseFromString(cleanHtml, "text/html"); + + // Rewrite relative URLs to absolute (CSP blocks on extension pages) + if (baseUrl) { + let resolveUrl = (relative: string): string => { + try { + return new URL(relative, baseUrl).href; + } catch (e) { + return relative; + } + }; + + // Images + let imgs = doc.querySelectorAll("img[src]"); + for (let i = 0; i < imgs.length; i++) { + let src = imgs[i].getAttribute("src"); + if (src && src.indexOf("data:") !== 0 && src.indexOf("blob:") !== 0) { + imgs[i].setAttribute("src", resolveUrl(src)); + } + } + + // Stylesheets + let links = doc.querySelectorAll('link[rel="stylesheet"][href]'); + for (let i = 0; i < links.length; i++) { + let href = links[i].getAttribute("href"); + if (href) { + links[i].setAttribute("href", resolveUrl(href)); + } + } + + // Srcset attributes + let srcsets = doc.querySelectorAll("[srcset]"); + for (let i = 0; i < srcsets.length; i++) { + let srcset = srcsets[i].getAttribute("srcset"); + if (srcset) { + let resolved = srcset.replace(/(\S+)(\s+\S+)?/g, (match, url, descriptor) => { + if (url && url.indexOf("data:") !== 0 && url.indexOf("blob:") !== 0) { + return resolveUrl(url) + (descriptor || ""); + } + return match; + }); + srcsets[i].setAttribute("srcset", resolved); + } + } + + // CSS url() in inline styles (background-image, content, etc.) 
+ let styledElements = doc.querySelectorAll("[style]"); + for (let i = 0; i < styledElements.length; i++) { + let style = styledElements[i].getAttribute("style"); + if (style && style.indexOf("url(") !== -1) { + let rewritten = style.replace(/url\(\s*(['"]?)([^)'"]+)\1\s*\)/g, (match, quote, urlVal) => { + if (urlVal && urlVal.indexOf("data:") !== 0 && urlVal.indexOf("blob:") !== 0 && urlVal.indexOf("#") !== 0) { + return "url(" + quote + resolveUrl(urlVal) + quote + ")"; + } + return match; + }); + styledElements[i].setAttribute("style", rewritten); + } + } + + // CSS url() in + diff --git a/src/scripts/clipperUI/clipper.tsx b/src/scripts/clipperUI/clipper.tsx index 320279be..2bf277c4 100644 --- a/src/scripts/clipperUI/clipper.tsx +++ b/src/scripts/clipperUI/clipper.tsx @@ -246,7 +246,7 @@ class ClipperClass extends ComponentBase { } else { this.state.setState({ fullPageResult: { status: Status.InProgress } }); - FullPageScreenshotHelper.getFullPageScreenshot(this.state.pageInfo.contentData, this.state.pageInfo.rawUrl).then((result) => { + FullPageScreenshotHelper.getFullPageScreenshot(this.state.pageInfo.contentData, this.state.pageInfo.rawUrl, this.state.pageInfo.stylesheetCache).then((result) => { this.state.setState({ fullPageResult: { data: result, status: Status.Succeeded } }); }, () => { this.state.setState({ diff --git a/src/scripts/contentCapture/fullPageScreenshotHelper.ts b/src/scripts/contentCapture/fullPageScreenshotHelper.ts index db1bc1b6..40fdf065 100644 --- a/src/scripts/contentCapture/fullPageScreenshotHelper.ts +++ b/src/scripts/contentCapture/fullPageScreenshotHelper.ts @@ -20,7 +20,7 @@ interface ScrollData { } export class FullPageScreenshotHelper { - public static getFullPageScreenshot(pageInfoContentData: string, pageUrl?: string): Promise { + public static getFullPageScreenshot(pageInfoContentData: string, pageUrl?: string, stylesheetCache?: { [url: string]: { cssText: string; media: string } }): Promise { return new Promise((resolve, 
reject) => { let fullPageScreenshotEvent = new Log.Event.PromiseEvent(Log.Event.Label.FullPageScreenshotCall); let correlationId = StringUtils.generateGuid(); @@ -31,6 +31,9 @@ export class FullPageScreenshotHelper { if (pageUrl) { storageData.fullPageBaseUrl = pageUrl; } + if (stylesheetCache) { + storageData.fullPageStylesheets = stylesheetCache; + } chrome.storage.session.set(storageData, () => { Clipper.getExtensionCommunicator().callRemoteFunction( @@ -49,7 +52,7 @@ export class FullPageScreenshotHelper { chrome.storage.session.remove([ "fullPageHtmlContent", "fullPageBaseUrl", "fullPageStatusText", - "fullPageScreenshots", "fullPageScrollData" + "fullPageScreenshots", "fullPageScrollData", "fullPageStylesheets" ]); if (dataUrls.length > 0) { diff --git a/src/scripts/domParsers/domUtils.ts b/src/scripts/domParsers/domUtils.ts index 7856be96..c3e984e6 100644 --- a/src/scripts/domParsers/domUtils.ts +++ b/src/scripts/domParsers/domUtils.ts @@ -336,6 +336,8 @@ export class DomUtils { */ public static getCleanDomOfCurrentPage(originalDoc: Document): string { let doc = DomUtils.cloneDocument(originalDoc); + DomUtils.inlineHiddenElements(doc, originalDoc); + DomUtils.flattenShadowDomSlots(doc, originalDoc); DomUtils.convertCanvasElementsToImages(doc, originalDoc); DomUtils.addBaseTagIfNecessary(doc, originalDoc.location); @@ -467,6 +469,89 @@ export class DomUtils { return container.insertBefore(spacerNode, referenceNode); } + /** + * Handle elements that were inside web components with shadow DOM. + * cloneNode(true) does NOT clone declarative shadow roots, so slotted content + * (e.g., dropdown panels with slot="dropdown") becomes visible as regular DOM. + * For elements whose shadow-hosted parent hid them via slot CSS, we check the + * original document's computed visibility and inline display:none if hidden. + * + * Also removes