Skip to content

Commit 26825fc

Browse files
committed
bugfix: using onlyIncludeTags and removeTags together
1 parent a532232 commit 26825fc

File tree

2 files changed

+27
-9
lines changed

2 files changed

+27
-9
lines changed

apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts

Lines changed: 17 additions & 0 deletions
Large diffs are not rendered by default.

apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,30 +1,31 @@
1-
import cheerio, { AnyNode, Cheerio } from "cheerio";
1+
import { AnyNode, Cheerio, load } from "cheerio";
22
import { PageOptions } from "../../../lib/entities";
33
import { excludeNonMainTags } from "./excludeTags";
44

55
export const removeUnwantedElements = (
66
html: string,
7-
pageOptions: PageOptions
7+
pageOptions: PageOptions,
88
) => {
9-
const soup = cheerio.load(html);
9+
let soup = load(html);
1010

1111
if (
1212
pageOptions.onlyIncludeTags &&
1313
pageOptions.onlyIncludeTags.length > 0 &&
14-
pageOptions.onlyIncludeTags[0] !== ''
14+
pageOptions.onlyIncludeTags[0] !== ""
1515
) {
1616
if (typeof pageOptions.onlyIncludeTags === "string") {
1717
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
1818
}
1919
if (pageOptions.onlyIncludeTags.length !== 0) {
2020
// Create a new root element to hold the tags to keep
21-
const newRoot = cheerio.load("<div></div>")("div");
21+
const newRoot = load("<div></div>")("div");
2222
pageOptions.onlyIncludeTags.forEach((tag) => {
2323
soup(tag).each((index, element) => {
2424
newRoot.append(soup(element).clone());
2525
});
2626
});
27-
return newRoot.html();
27+
28+
soup = load(newRoot.html());
2829
}
2930
}
3031

@@ -33,7 +34,7 @@ export const removeUnwantedElements = (
3334
if (
3435
pageOptions.removeTags &&
3536
pageOptions.removeTags.length > 0 &&
36-
pageOptions.removeTags[0] !== ''
37+
pageOptions.removeTags[0] !== ""
3738
) {
3839
if (typeof pageOptions.removeTags === "string") {
3940
pageOptions.removeTags = [pageOptions.removeTags];
@@ -51,11 +52,11 @@ export const removeUnwantedElements = (
5152
const attributes = element.attribs;
5253
const tagNameMatches = regexPattern.test(element.name);
5354
const attributesMatch = Object.keys(attributes).some((attr) =>
54-
regexPattern.test(`${attr}="${attributes[attr]}"`)
55+
regexPattern.test(`${attr}="${attributes[attr]}"`),
5556
);
5657
if (tag.startsWith("*.")) {
5758
classMatch = Object.keys(attributes).some((attr) =>
58-
regexPattern.test(`class="${attributes[attr]}"`)
59+
regexPattern.test(`class="${attributes[attr]}"`),
5960
);
6061
}
6162
return tagNameMatches || attributesMatch || classMatch;

0 commit comments

Comments
 (0)