TrendSeeker tracks real-time trends, automates Twitter updates, allows customizable analysis, and provides a web app with a chatbot. Future plans include token trading and DeFi actions. It's open-source for community collaboration. For additional info, check out our [GitHub].
=====================================================
from requests_html import HTMLSession, HTML
from lxml.etree import ParserError
# Module-level requests_html session; Profile.__init__ uses it to fetch the profile page.
session = HTMLSession()
class Profile:
    """Parse a Twitter profile page and expose its fields as attributes.

    Attributes:
        - name
        - username
        - birthday
        - location
        - biography
        - website
        - profile_photo
        - banner_photo
        - likes_count
        - tweets_count
        - followers_count
        - following_count
        - is_verified
        - is_private
        - user_id

    Optional fields (birthday, location, biography, website, banner_photo and
    the four counters) are ``None`` when the page does not provide them.
    """

    def __init__(self, username):
        """Fetch ``https://twitter.com/<username>`` and parse the result.

        Args:
            username: Screen name of the profile to load.

        Raises:
            ValueError: If the returned page cannot be parsed.
        """
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Referer": f"https://twitter.com/{username}",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
            "X-Twitter-Active-User": "yes",
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Language": "en-US",
        }
        page = session.get(f"https://twitter.com/{username}", headers=headers)
        self.username = username
        self.__parse_profile(page)

    @staticmethod
    def _text_or_none(html, selector):
        """Return the text of the first node matching *selector*, or None if absent/empty."""
        nodes = html.find(selector)
        if not nodes:
            return None
        return nodes[0].text or None

    @staticmethod
    def _count_or_none(html, kind):
        """Return the integer ``data-count`` of the ``--<kind>`` nav item, or None."""
        selector = f'li[class*="--{kind}"] span[data-count]'
        try:
            return int(html.find(selector)[0].attrs["data-count"])
        except (IndexError, KeyError, ValueError):
            return None

    def __parse_profile(self, page):
        """Populate the instance attributes from the fetched profile *page*.

        Raises:
            ValueError: If the page markup cannot be parsed.
        """
        try:
            html = HTML(html=page.text, url="bunk", default_encoding="utf-8")
        except (KeyError, ParserError) as err:
            # Previously ParserError was swallowed, leaving `html` unbound and
            # producing an UnboundLocalError a few lines later.
            raise ValueError(
                f'Oops! Either "{self.username}" does not exist or is private.'
            ) from err

        # A badge icon being present is what marks the account as protected/verified.
        self.is_private = bool(html.find(".ProfileHeaderCard-badges .Icon--protected"))
        self.is_verified = bool(html.find(".ProfileHeaderCard-badges .Icon--verified"))

        self.location = self._text_or_none(html, ".ProfileHeaderCard-locationText")

        birthday = self._text_or_none(html, ".ProfileHeaderCard-birthdateText")
        self.birthday = birthday.replace("Born ", "") if birthday else None

        self.profile_photo = html.find(".ProfileAvatar-image")[0].attrs["src"]

        try:
            self.banner_photo = html.find(".ProfileCanopy-headerBg img")[0].attrs["src"]
        except (IndexError, KeyError):
            # IndexError: no banner element at all; KeyError: element without `src`.
            self.banner_photo = None

        # The <title> text is sliced at the first "(" to obtain the display name.
        page_title = html.find("title")[0].text
        self.name = page_title[: page_title.find("(")].strip()

        self.user_id = html.find(".ProfileNav")[0].attrs["data-user-id"]

        # The original re-checked `birthday` here; the empty-value normalisation
        # was clearly meant for the biography.
        self.biography = self._text_or_none(html, ".ProfileHeaderCard-bio")
        self.website = self._text_or_none(html, ".ProfileHeaderCard-urlText")

        # Counters are optional: missing or malformed values become None.
        self.tweets_count = self._count_or_none(html, "tweets")
        self.following_count = self._count_or_none(html, "following")
        self.followers_count = self._count_or_none(html, "followers")
        self.likes_count = self._count_or_none(html, "favorites")

    def to_dict(self):
        """Return all profile fields as a plain dictionary."""
        return dict(
            name=self.name,
            username=self.username,
            birthday=self.birthday,
            biography=self.biography,
            location=self.location,
            website=self.website,
            profile_photo=self.profile_photo,
            banner_photo=self.banner_photo,
            likes_count=self.likes_count,
            tweets_count=self.tweets_count,
            followers_count=self.followers_count,
            following_count=self.following_count,
            is_verified=self.is_verified,
            is_private=self.is_private,
            user_id=self.user_id,
        )

    def __dir__(self):
        """List the public profile attributes (used by ``dir()``)."""
        return [
            "name",
            "username",
            "birthday",
            "location",
            "biography",
            "website",
            "profile_photo",
            "banner_photo",  # a missing comma here used to fuse this with "likes_count"
            "likes_count",
            "tweets_count",
            "followers_count",
            "following_count",
            "is_verified",
            "is_private",
            "user_id",
        ]

    def __repr__(self):
        return f"<profile {self.username}@twitter>"
# Module-level requests_html session; get_trends() uses it to request the trends endpoint.
session = HTMLSession()
def get_trends():
    """Return the trend names currently listed by Twitter's trends endpoint.

    Requests ``https://twitter.com/i/trends``, takes the HTML fragment stored
    under ``module_html`` in the JSON response, and collects the
    ``data-trend-name`` attribute of every ``<li>`` element in it.

    Returns:
        list: Trend names, in document order.
    """
    request_headers = {
        "X-Twitter-Active-User": "yes",
        "X-Requested-With": "XMLHttpRequest",
        "Accept-Language": "en-US",
    }
    response = session.get("https://twitter.com/i/trends", headers=request_headers)
    fragment = response.json()["module_html"]
    document = HTML(html=fragment, url="bunk", default_encoding="utf-8")
    return [item.attrs["data-trend-name"] for item in document.find("li")]
import refrom requests_html import HTMLSession, HTMLfrom datetime import datetimefrom urllib.parse import quotefrom lxml.etree import ParserError
# Module-level requests_html session; get_tweets() uses it for every timeline request.
session = HTMLSession()
def get_tweets(query, pages=25):
    """Yield tweets for a user or hashtag via the Twitter frontend timeline API.

    Args:
        query: A screen name, or a hashtag starting with ``#`` (which switches
            to the search-timeline endpoint; the hashtag is URL-quoted).
        pages: Maximum number of timeline pages to request.

    Yields:
        dict: One entry per tweet with the keys ``tweetId``, ``tweetUrl``,
        ``username``, ``userId``, ``isRetweet``, ``isPinned``, ``time``,
        ``text``, ``replies``, ``retweets``, ``likes`` and ``entries``
        (``hashtags``, ``urls``, ``photos``, ``videos``).

    Raises:
        ValueError: If a response has no ``items_html`` key.
    """
    after_part = "include_available_features=1&include_entities=1&include_new_items_bar=true"
    if query.startswith("#"):
        query = quote(query)
        url = f"https://x.com/i/search/timeline?f=tweets&vertical=default&q={query}&src=tyah&reset_error_state=false&"
    else:
        url = f"https://x.com/i/profiles/show/{query}/timeline/tweets?"
    url += after_part

    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": f"https://x.com/{query}",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
        "X-Twitter-Active-User": "yes",
        "X-Requested-With": "XMLHttpRequest",
        "Accept-Language": "en-US",
    }

    def gen_tweets(pages):
        """Request up to *pages* timeline pages and yield the tweets on each."""
        request = session.get(url + "&max_position", headers=headers)

        while pages > 0:
            try:
                json_response = request.json()
                html = HTML(
                    html=json_response["items_html"],
                    url="bunk",
                    default_encoding="utf-8",
                )
            except KeyError as err:
                raise ValueError(
                    f'Oops! Either "{query}" does not exist or is private.'
                ) from err
            except ParserError:
                break

            stream_items = html.find(".stream-item")
            if not stream_items:
                # An empty page means the timeline is exhausted. The original
                # indexed [-1] here for an unused variable and crashed with
                # IndexError instead of ending the generator.
                break

            comma = ","
            dot = "."
            tweets = []
            for tweet, profile in zip(
                stream_items, html.find(".js-profile-popup-actionable")
            ):
                # 10~11 html elements have `.stream-item` class and also their
                # `data-item-type` is `tweet`, but their content doesn't look
                # like a tweet's content
                try:
                    text = tweet.find(".tweet-text")[0].full_text
                except IndexError:  # issue #50
                    continue

                tweet_id = tweet.attrs["data-item-id"]
                tweet_url = profile.attrs["data-permalink-path"]
                username = profile.attrs["data-screen-name"]
                user_id = profile.attrs["data-user-id"]
                is_pinned = bool(tweet.find("div.pinned"))

                # `data-time-ms` is in milliseconds; fromtimestamp wants seconds.
                time = datetime.fromtimestamp(
                    int(tweet.find("._timestamp")[0].attrs["data-time-ms"]) / 1000.0
                )

                interactions = [x.text for x in tweet.find(".ProfileTweet-actionCount")]

                # Strip thousands separators; when the first cell is empty the
                # count is taken from one of the later cells instead.
                replies = int(
                    interactions[0].split(" ")[0].replace(comma, "").replace(dot, "")
                    or interactions[3]
                )
                retweets = int(
                    interactions[1].split(" ")[0].replace(comma, "").replace(dot, "")
                    or interactions[4]
                    or interactions[5]
                )
                likes = int(
                    interactions[2].split(" ")[0].replace(comma, "").replace(dot, "")
                    or interactions[6]
                    or interactions[7]
                )

                hashtags = [
                    hashtag_node.full_text for hashtag_node in tweet.find(".x-hashtag")
                ]

                urls = [
                    url_node.attrs["data-expanded-url"]
                    for url_node in (
                        tweet.find("a.x-timeline-link:not(.u-hidden)")
                        + tweet.find(
                            "[class='js-tweet-text-container'] a[data-expanded-url]"
                        )
                    )
                ]
                urls = list(set(urls))  # delete duplicated elements

                photos = [
                    photo_node.attrs["data-image-url"]
                    for photo_node in tweet.find(".AdaptiveMedia-photoContainer")
                ]

                is_retweet = bool(
                    tweet.find(".js-stream-tweet")[0].attrs.get("data-retweet-id", None)
                )

                # The video id is the file stem of the player's background image URL.
                videos = []
                for node in tweet.find(".PlayableMedia-player"):
                    for style in node.attrs["style"].split():
                        if style.startswith("background"):
                            tmp = style.split("/")[-1]
                            video_id = (
                                tmp[: tmp.index(".jpg")]
                                if ".jpg" in tmp
                                else tmp[: tmp.index(".png")]
                                if ".png" in tmp
                                else None
                            )
                            videos.append({"id": video_id})

                tweets.append(
                    {
                        "tweetId": tweet_id,
                        "tweetUrl": tweet_url,
                        "username": username,
                        "userId": user_id,
                        "isRetweet": is_retweet,
                        "isPinned": is_pinned,
                        "time": time,
                        "text": text,
                        "replies": replies,
                        "retweets": retweets,
                        "likes": likes,
                        "entries": {
                            "hashtags": hashtags,
                            "urls": urls,
                            "photos": photos,
                            "videos": videos,
                        },
                    }
                )

            for tweet in tweets:
                # Insert a space before the first glued-on link / pic URL.
                # Raw replacement strings avoid the invalid "\g" escape warning;
                # `count` is passed by keyword (positional use is deprecated).
                tweet["text"] = re.sub(r"(\S)http", r"\g<1> http", tweet["text"], count=1)
                tweet["text"] = re.sub(r"(\S)pic\.x", r"\g<1> pic.x", tweet["text"], count=1)
                yield tweet

            request = session.get(
                url,
                params={"max_position": json_response["min_position"]},
                headers=headers,
            )
            pages -= 1

    yield from gen_tweets(pages)
# For searching:
#
# https://x.com/i/search/timeline?vertical=default&q=foof&src=typd&composed_count=0&include_available_features=1&include_entities=1&include_new_items_bar=true&interval=30000&latent_count=0
#
# Replace 'foof' with your query string. Not sure how to decode it yet, but it seems to work.
export = {
    // Scrape type names (interpolated into the "Missing scraping type" error message).
    scrape: [
        'user',
        'hashtag',
        'trend',
        'music',
        'discover_user',
        'discover_hashtag',
        'discover_music',
        'history',
        'video',
        'from-file',
        'userprofile',
    ],
    // NOTE(review): the three lists below are presumably per-feature subsets of the scrape types — confirm against their consumers.
    chronologicalTypes: ['user'],
    history: ['user', 'hashtag', 'trend', 'music'],
    requiredSession: ['user', 'hashtag', 'trend', 'music'],
    // Numeric source type code per scrape type.
    sourceType: {
        user: 8,
        music: 11,
        trend: 12,
    },
    /**
     * verifyFp is used to bypass captcha.
     * Picks one entry at random from a hardcoded list; the list is currently
     * empty, so the result is `undefined`.
     * Later someone should implement a proper way to generate a valid value.
     */
    verifyFp: () => {
        const candidates = [];
        const pick = Math.floor(Math.random() * candidates.length);
        return candidates[pick];
    },
    /**
     * Generate a random (fake) Chrome user-agent string with random version numbers.
     */
    userAgent: () => {
        const platforms = [
            'Macintosh; Intel Mac OS X 10_15_7',
            'Macintosh; Intel Mac OS X 10_15_5',
            'Macintosh; Intel Mac OS X 10_11_6',
            'Macintosh; Intel Mac OS X 10_6_6',
            'Macintosh; Intel Mac OS X 10_9_5',
            'Macintosh; Intel Mac OS X 10_10_5',
            'Macintosh; Intel Mac OS X 10_7_5',
            'Macintosh; Intel Mac OS X 10_11_3',
            'Macintosh; Intel Mac OS X 10_10_3',
            'Macintosh; Intel Mac OS X 10_6_8',
            'Macintosh; Intel Mac OS X 10_10_2',
            'Macintosh; Intel Mac OS X 10_10_3',
            'Macintosh; Intel Mac OS X 10_11_5',
            'Windows NT 10.0; Win64; x64',
            'Windows NT 10.0; WOW64',
            'Windows NT 10.0',
        ];
        // Random draws happen in the same order as the fields appear in the string.
        const platform = platforms[Math.floor(Math.random() * platforms.length)];
        const major = Math.floor(Math.random() * 3) + 87; // 87..89
        const build = Math.floor(Math.random() * 190) + 4100; // 4100..4289
        const patch = Math.floor(Math.random() * 50) + 140; // 140..189
        return `Mozilla/5.0 (${platform}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${major}.0.${build}.${patch} Safari/537.36`;
    },
};
/* eslint-disable class-methods-use-this */
/* eslint-disable no-param-reassign */
/* eslint-disable consistent-return */
/* eslint-disable no-console */
import request, { OptionsWithUri, CookieJar } from 'request';
import rp from 'request-promise';
import { Agent } from 'http';
import { createWriteStream, writeFile } from 'fs';
import { fromCallback } from 'bluebird';
import archiver from 'archiver';
import { SocksProxyAgent } from 'socks-proxy-agent';
import { forEachLimit } from 'async';
import { MultipleBar } from '../helpers';
import { DownloaderConstructor, PostCollector, DownloadParams, Proxy, Headers } from '../types';
/**
 * Downloads post videos, either into memory, into a folder, or into a zip archive,
 * optionally through an HTTP or SOCKS proxy.
 */
export class Downloader {
    // Whether progress bars and "empty response" notices are shown
    public progress: boolean;

    public mbars: MultipleBar;

    // Progress bars created so far, one per download
    public progressBar: any[];

    // A single proxy, or a list from which one is picked at random per request
    private proxy: string[] | string;

    public noWaterMark: boolean;

    public filepath: string;

    // When true, per-file progress output is suppressed
    public bulk: boolean;

    public headers: Headers;

    public cookieJar: CookieJar;

    constructor({ progress, proxy, noWaterMark, headers, filepath, bulk, cookieJar }: DownloaderConstructor) {
        // The original `true || progress` always evaluated to true and ignored the option.
        // Honour an explicit boolean; keep the old "enabled" behaviour when it is omitted.
        this.progress = typeof progress === 'boolean' ? progress : true;
        this.progressBar = [];
        this.noWaterMark = noWaterMark;
        this.headers = headers;
        this.filepath = filepath;
        this.mbars = new MultipleBar();
        this.proxy = proxy;
        this.bulk = bulk;
        this.cookieJar = cookieJar;
    }

    /**
     * Get proxy.
     * Picks a random entry when a list was configured, then classifies it:
     * `socks4://` / `socks5://` URLs are wrapped in a SocksProxyAgent, anything else is
     * returned as a plain string (empty string when no proxy is configured).
     */
    private get getProxy(): Proxy {
        let selected = '';
        if (Array.isArray(this.proxy)) {
            if (this.proxy.length) {
                selected = this.proxy[Math.floor(Math.random() * this.proxy.length)];
            }
        } else if (this.proxy) {
            // Guards against an undefined proxy, which used to crash on `.indexOf`
            selected = this.proxy;
        }

        // Previously only a single-string proxy was checked for SOCKS; a SOCKS entry
        // taken from a list was wrongly treated as an HTTP proxy.
        if (selected.indexOf('socks4://') > -1 || selected.indexOf('socks5://') > -1) {
            return {
                socks: true,
                proxy: new SocksProxyAgent(selected),
            };
        }
        return {
            socks: false,
            proxy: selected,
        };
    }

    /**
     * Add new bar to indicate download progress
     * @param {boolean} type true when the video is downloaded without the watermark
     * @param {number} len total number of bytes expected
     * @returns the newly created bar
     */
    public addBar(type: boolean, len: number): any[] {
        this.progressBar.push(
            this.mbars.newBar(`Downloading (${!type ? 'WITH WM' : 'WITHOUT WM'}) :id [:bar] :percent`, {
                complete: '=',
                incomplete: ' ',
                width: 30,
                total: len,
            }),
        );

        return this.progressBar[this.progressBar.length - 1];
    }

    /**
     * Convert video file to the buffer
     * @param {*} item post whose video should be fetched (the no-watermark URL is preferred)
     * @returns the downloaded bytes; rejects when the request emits an error
     */
    public toBuffer(item: PostCollector): Promise<Buffer> {
        return new Promise((resolve, reject) => {
            const proxy = this.getProxy;
            let r = request;
            let barIndex;
            // Collect chunks and join once at the end instead of re-copying the
            // whole buffer on every 'data' event.
            const chunks: Buffer[] = [];
            if (proxy.proxy && !proxy.socks) {
                r = request.defaults({ proxy: `http://${proxy.proxy}/` });
            }
            if (proxy.proxy && proxy.socks) {
                r = request.defaults({ agent: (proxy.proxy as unknown) as Agent });
            }
            r.get({
                url: item.videoUrlNoWaterMark ? item.videoUrlNoWaterMark : item.videoUrl,
                headers: this.headers,
                jar: this.cookieJar,
            })
                .on('response', response => {
                    const len = parseInt(response.headers['content-length'] as string, 10);
                    if (this.progress && !this.bulk && len) {
                        barIndex = this.addBar(!!item.videoUrlNoWaterMark, len);
                    }
                    if (this.progress && !this.bulk && !len) {
                        console.log(`Empty response! You can try again with a proxy! Can't download video: ${item.id}`);
                    }
                })
                .on('data', chunk => {
                    if (chunk.length) {
                        chunks.push(chunk as Buffer);
                        if (this.progress && !this.bulk && barIndex && barIndex.hasOwnProperty('tick')) {
                            barIndex.tick(chunk.length, { id: item.id });
                        }
                    }
                })
                .on('end', () => {
                    resolve(Buffer.concat(chunks));
                })
                .on('error', () => {
                    reject(new Error(`Cant download video: ${item.id}. If you were using proxy, please try without it.`));
                });
        });
    }

    /**
     * Download posts
     * if {zip} is true then zip the result else save posts to the {folder}
     *
     * Each item gets its `downloaded` flag set; a failed download of a single item
     * does not abort the batch. At most {asyncDownload} downloads run concurrently.
     */
    public downloadPosts({ zip, folder, collector, fileName, asyncDownload }: DownloadParams) {
        return new Promise((resolve, reject) => {
            const saveDestination = zip ? `${fileName}.zip` : folder;
            const archive = archiver('zip', {
                gzip: true,
                zlib: { level: 9 },
            });
            if (zip) {
                const output = createWriteStream(saveDestination);
                // Without these handlers a failing stream would leave the promise pending forever
                output.on('error', reject);
                archive.on('error', reject);
                archive.pipe(output);
            }

            forEachLimit(
                collector,
                asyncDownload,
                (item: PostCollector, cb) => {
                    this.toBuffer(item)
                        .then(async buffer => {
                            if (buffer.length) {
                                item.downloaded = true;
                                if (zip) {
                                    archive.append(buffer, { name: `${item.id}.mp4` });
                                } else {
                                    await fromCallback(cback => writeFile(`${saveDestination}/${item.id}.mp4`, buffer, cback));
                                }
                            } else {
                                item.downloaded = false;
                            }
                            cb(null);
                        })
                        .catch(() => {
                            item.downloaded = false;
                            cb(null);
                        });
                },
                error => {
                    if (error) {
                        return reject(error);
                    }

                    if (zip) {
                        // Register the listener before finalizing so the event cannot be missed
                        archive.on('end', () => resolve(''));
                        archive.finalize();
                    } else {
                        resolve('');
                    }
                },
            );
        });
    }

    /**
     * Download single video without the watermark
     * (falls back to the regular video URL when no no-watermark URL is available)
     * and write it to `{filepath}/{post.id}.mp4`.
     * @param post
     */
    public async downloadSingleVideo(post: PostCollector) {
        const proxy = this.getProxy;
        let url = post.videoUrlNoWaterMark;
        if (!url) {
            url = post.videoUrl;
        }
        const options = ({
            uri: url,
            method: 'GET',
            jar: this.cookieJar,
            headers: this.headers,
            // encoding: null makes the response body a Buffer instead of a string
            encoding: null,
            ...(proxy.proxy && proxy.socks ? { agent: proxy.proxy } : {}),
            ...(proxy.proxy && !proxy.socks ? { proxy: `http://${proxy.proxy}/` } : {}),
        } as unknown) as OptionsWithUri;

        const result = await rp(options);

        await fromCallback(cb => writeFile(`${this.filepath}/${post.id}.mp4`, result, cb));
    }
}
import fs from 'fs';
import { ScrapeType, Result, RequestQuery, UserMetadata, PostCollector, HashtagMetadata } from '../types';
import { TikTokScraper } from './TikTok';
import CONST from '../constant';
// Replace both HTTP client modules with Jest mocks for every suite in this file.
// NOTE(review): presumably backed by manual mocks in a __mocks__ directory — confirm.
jest.mock('request-promise-native');
jest.mock('request-promise');
describe('TikTok Scraper MODULE(promise): user(valid input data)', () => {
    let scraper;

    // Scraper under test: 'user' feed for "tiktok", limited to 5 posts, nothing downloaded.
    const createScraper = () =>
        new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 3,
            filetype: '',
            filepath: '',
            input: 'tiktok',
            noWaterMark: false,
            type: 'user',
            headers: {
                'user-agent': 'Custom user-agent',
            },
            proxy: '',
            number: 5,
        });

    beforeAll(() => {
        scraper = createScraper();
    });

    it('user input should not be empty', async () => {
        expect(scraper).toBeInstanceOf(TikTokScraper);
        expect(scraper.input).toContain('tiktok');
    });

    it('set custom user-agent', async () => {
        expect(scraper).toBeInstanceOf(TikTokScraper);
        expect(scraper.headers['user-agent']).toContain('Custom user-agent');
    });

    it('getUserId should return a valid Object', async () => {
        const expectedQuery = {
            count: 30,
            id: '107955',
            lang: '',
            maxCursor: 0,
            minCursor: 0,
            secUid: '',
            sourceType: 8,
            verifyFp: '',
        };
        const query: RequestQuery = await scraper.getUserId();
        expect(query).toEqual(expectedQuery);
    });

    it('result should contain array value with the length 5', async () => {
        const result: Result = await scraper.scrape();
        expect(result.collector.length).toEqual(5);
    });
});
describe('TikTok Scraper MODULE(event): user(valid input data)', () => {
    let scraper;

    // Same 'user' scrape as the promise suite, but in event mode and limited to one post.
    const createScraper = () =>
        new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: 'tiktok',
            type: 'user',
            headers: {
                'user-agent': 'Custom user-agent',
            },
            proxy: '',
            number: 1,
            event: true,
        });

    beforeAll(() => {
        scraper = createScraper();
    });

    it('result should emit "done" event if task was completed', done => {
        // Subscribe first, then start scraping; the test finishes when 'done' is emitted.
        scraper.on('done', payload => {
            expect(payload).toEqual('completed');
            done();
        });
        scraper.scrape();
    });
});
describe('TikTok Scraper MODULE(promise): user(invalid input data)', () => {
    // Both tests must await the `.rejects` assertion: without await/return the test
    // finishes before the promise settles and passes even if the assertion fails.
    it('Throw error if username is empty', async () => {
        const instance = new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: '',
            type: 'user',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });
        await expect(instance.scrape()).rejects.toEqual('Missing input');
    });

    it('Throw error if wrong scraping type was provided', async () => {
        const instance = new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: '',
            type: 'fake' as ScrapeType,
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });
        await expect(instance.scrape()).rejects.toEqual(`Missing scraping type. Scrape types: ${CONST.scrape} `);
    });
});
describe('TikTok Scraper MODULE(event): user(invalid input data)', () => {
    // In event mode failures are reported through the 'error' event rather than a rejection.
    it('Throw error if username is empty', done => {
        const scraper = new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: '',
            type: 'user',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 1,
            event: true,
        });
        const expectedMessage = 'Missing input';
        scraper.on('error', message => {
            expect(message).toEqual(expectedMessage);
            done();
        });
        scraper.scrape();
    });

    it('Throw error if wrong scraping type was provided', done => {
        const scraper = new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: '',
            type: 'fake' as ScrapeType,
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
            event: true,
        });
        const expectedMessage = `Missing scraping type. Scrape types: ${CONST.scrape} `;
        scraper.on('error', message => {
            expect(message).toEqual(expectedMessage);
            done();
        });
        scraper.scrape();
    });
});
describe('TikTok Scraper MODULE(promise): user(save to a file)', () => {
    let instance;
    let posts: Result;

    beforeAll(async () => {
        // Stub fs.writeFile so nothing touches the disk; the stub reports success immediately.
        jest.spyOn(fs, 'writeFile').mockImplementation((file, option, cb) => cb(null));

        instance = new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: 'all',
            filepath: '',
            input: 'tiktok',
            type: 'user',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });
        posts = await instance.scrape();
    });

    afterAll(() => {
        jest.restoreAllMocks();
    });

    it('fs.WriteFile should be called 2 times. Save to a csv and json', async () => {
        expect(fs.writeFile).toHaveBeenCalledTimes(2);
    });

    it('result should contain a valid file names for the csv and json files', async () => {
        // The dot before the extension is escaped; unescaped it matched any character.
        expect(posts.csv).toMatch(/^(\w+)_([0-9]{13})\.csv$/);
        expect(posts.json).toMatch(/^(\w+)_([0-9]{13})\.json$/);
    });
});
describe('TikTok Scraper MODULE(promise): hashtag(valid input data)', () => {
    let scraper;

    // Scraper under test: 'hashtag' feed for "summer", limited to 5 posts.
    const createScraper = () =>
        new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: 'summer',
            type: 'hashtag',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });

    beforeAll(() => {
        scraper = createScraper();
    });

    it('hashtag input should not be empty', async () => {
        expect(scraper).toBeInstanceOf(TikTokScraper);
        expect(scraper.input).toContain('summer');
    });

    it('getHashTagId should return a valid Object', async () => {
        const expectedQuery = { aid: 1988, challengeID: '99770', count: 30, cursor: 0, user_agent: 'okhttp', verifyFp: '' };
        const query: RequestQuery = await scraper.getHashTagId();
        expect(query).toEqual(expectedQuery);
    });

    // Disabled test, kept as found:
    // it('result should contain array value with the length 5', async () => {
    //     const posts: Result = await instance.scrape();
    //     expect(posts.collector.length).toEqual(5);
    // });
});
describe('TikTok Scraper MODULE(promise): signUrl', () => {
    let scraper;

    // Scraper under test: the input is the URL that signUrl() should sign.
    const createScraper = () =>
        new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: 'https://m.tiktok.com/share/item/list?secUid=&id=355503&type=3&count=30&minCursor=0&maxCursor=0&shareUid=&lang=',
            type: 'signature',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });

    beforeAll(() => {
        scraper = createScraper();
    });

    it('signUrl should return a valid signature', async () => {
        const signed: string = await scraper.signUrl();
        expect(signed).not.toBeNull();
    });

    it('Throw error if input url is empty', async () => {
        // Mutates the shared scraper, so this test has to stay last in the suite.
        scraper.input = '';
        const pending = scraper.signUrl();
        await expect(pending).rejects.toBe(`Url is missing`);
    });
});
describe('TikTok Scraper MODULE(promise): getHashtagInfo', () => {
    const hashtagName = 'summer';
    let scraper;

    // Scraper under test: single-hashtag lookup for `hashtagName`.
    const createScraper = () =>
        new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: hashtagName,
            type: 'single_hashtag',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });

    beforeAll(() => {
        scraper = createScraper();
    });

    it('getHashtagInfo should return a valid Object', async () => {
        // Expected metadata for the lookup above.
        const expectedInfo = {
            challenge: {
                id: '99770',
                title: 'duett',
                desc: 'Habt ihr schon unsere neue Duett-Funktion gecheckt? Oben, unten, links, rechts alles möglich jetzt.',
                profileThumb: 'https://p16-va-default.akamaized.net/obj/musically-maliva-obj/92760d2f9cce09720b20ae060081efc8',
                profileMedium: 'https://p16-va-default.akamaized.net/obj/musically-maliva-obj/92760d2f9cce09720b20ae060081efc8',
                profileLarger: 'https://p16-va-default.akamaized.net/obj/musically-maliva-obj/92760d2f9cce09720b20ae060081efc8',
                coverThumb: 'https://p16-va-default.akamaized.net/obj/musically-maliva-obj/fa5fcd3ee0a9581fc26d9e3b811e428e',
                coverMedium: 'https://p16-va-default.akamaized.net/obj/musically-maliva-obj/fa5fcd3ee0a9581fc26d9e3b811e428e',
                coverLarger: 'https://p16-va-default.akamaized.net/obj/musically-maliva-obj/fa5fcd3ee0a9581fc26d9e3b811e428e',
                isCommerce: false,
            },
            stats: { videoCount: 0, viewCount: 37100000000 },
            shareMeta: { title: '#duett on TikTok', desc: '37099.0m views - Watch awesome short videos created with trending hashtag #duett' },
            challengeAnnouncement: {},
        };
        const info: HashtagMetadata = await scraper.getHashtagInfo();
        expect(info).toEqual(expectedInfo);
    });

    // The remaining tests overwrite the shared scraper's input before each call.
    it('Throw error if input hashtag is empty', async () => {
        scraper.input = '';
        const pending = scraper.getHashtagInfo();
        await expect(pending).rejects.toBe(`Hashtag is missing`);
    });

    it(`Throw error if hashtag doesn't exist`, async () => {
        scraper.input = 'na';
        const pending = scraper.getHashtagInfo();
        await expect(pending).rejects.toBe(`Can't find hashtag: na`);
    });
});
describe('TikTok Scraper MODULE(promise): getUserProfileInfo', () => {
    const userName = 'tiktok';
    let scraper;

    // Scraper under test: single-user profile lookup for `userName`.
    const createScraper = () =>
        new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: userName,
            type: 'single_user',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });

    beforeAll(() => {
        scraper = createScraper();
    });

    it('getUserProfileInfo should return a valid Object', async () => {
        // Expected profile metadata for the lookup above.
        const expectedProfile = {
            user: {
                id: '107955',
                uniqueId: 'tiktok',
                nickname: 'TikTok',
                avatarThumb:
                    'https://p16-sign-va.tiktokcdn.com/musically-maliva-obj/1645136815763462~c5_100x100.jpeg?x-expires=1610028000&x-signature=kEnsi2vZJE9DYy5q3UH%2FKAIH8pI%3D',
                avatarMedium:
                    'https://p16-sign-va.tiktokcdn.com/musically-maliva-obj/1645136815763462~c5_720x720.jpeg?x-expires=1610028000&x-signature=ZcG9nv927kBXHRsEh9ZeFAGjqzM%3D',
                avatarLarger:
                    'https://p16-sign-va.tiktokcdn.com/musically-maliva-obj/1645136815763462~c5_1080x1080.jpeg?x-expires=1610028000&x-signature=44JuBpJgUlN4dau%2B3eFemKgTrJI%3D',
                verified: true,
                createTime: 1425144149,
                secUid: 'MS4wLjABAAAAv7iSuuXDJGDvJkmH_vz1qkDZYo1apxgzaxdBSeIuPiM',
                secret: false,
                ftc: false,
                relation: 1,
                openFavorite: true,
                commentSetting: 0,
                duetSetting: 0,
                stitchSetting: 0,
                privateAccount: false,
                shortId: '0',
                signature: 'It Starts On TikTok',
            },
            stats: { followingCount: 496, followerCount: 50100000, heartCount: 246000000, videoCount: 118, diggCount: 53, heart: 246000000 },
            itemList: [],
        };
        const profile: UserMetadata = await scraper.getUserProfileInfo();
        expect(profile).toEqual(expectedProfile);
    });

    it('Throw error if input username is empty', async () => {
        // Mutates the shared scraper, so this test has to stay last in the suite.
        scraper.input = '';
        const pending = scraper.getUserProfileInfo();
        await expect(pending).rejects.toBe(`Username is missing`);
    });
});
describe('TikTok Scraper CLI: user(save progress)', () => {
    let instance;
    let posts: Result;

    beforeAll(async () => {
        // Stub the fs calls so the history store never touches the disk:
        // writes succeed immediately, reads return the buffer "0".
        jest.spyOn(fs, 'writeFile').mockImplementation((file, option, cb) => cb(null));
        jest.spyOn(fs, 'readFile').mockImplementation((file, cb) => cb(null, Buffer.from('0')));

        instance = new TikTokScraper({
            download: true,
            cli: true,
            zip: true,
            store_history: true,
            test: true,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: 'tiktok',
            type: 'user',
            headers: {
                'user-agent': 'okhttp',
            },
            proxy: '',
            number: 5,
        });
        posts = await instance.scrape();
    });

    afterAll(() => {
        jest.restoreAllMocks();
    });

    it('fs.readFile should be called 2 times', async () => {
        expect(fs.readFile).toHaveBeenCalledTimes(2);
    });

    it('fs.writeFile should be called 2 times', async () => {
        expect(fs.writeFile).toHaveBeenCalledTimes(2);
    });

    it('result should contain a valid file name for the Zip file', async () => {
        // The dot before the extension is escaped; unescaped it matched any character.
        expect(posts.zip).toMatch(/^(\w+)_([0-9]{13})\.zip$/);
    });
});
describe('TikTok Scraper MODULE(promise): getVideoMeta', () => {
    let scraper;

    // A fresh scraper per test, because two of the tests overwrite `input`.
    const createScraper = () =>
        new TikTokScraper({
            download: false,
            asyncDownload: 5,
            asyncScraping: 5,
            filetype: '',
            filepath: '',
            input: 'https://www.tiktok.com/@tiktok/video/6807491984882765062',
            type: 'video_meta',
            headers: {
                'user-agent': CONST.userAgent(),
            },
            proxy: '',
            number: 5,
            hdVideo: false,
        });

    beforeEach(() => {
        scraper = createScraper();
    });

    it('getVideoMeta should return a valid Object', async () => {
        // Expected post metadata for the lookup above.
        const expectedPost = {
            id: '6881450806688664838',
            text: 'Good vibes only 🤙 @420doggface208 @mickfleetwood @tomhayes603',
            createTime: 1602212662,
            authorMeta: {
                id: '107955',
                secUid: 'MS4wLjABAAAAv7iSuuXDJGDvJkmH_vz1qkDZYo1apxgzaxdBSeIuPiM',
                name: 'tiktok',
                nickName: 'TikTok',
                following: 491,
                fans: 48300000,
                heart: 241100000,
                video: 112,
                digg: 35,
                verified: true,
                private: false,
                signature: 'Make Your Day',
                avatar:
                    'https://p16-sign-va.tiktokcdn.com/musically-maliva-obj/1645136815763462~c5_1080x1080.jpeg?x-expires=1603573200&x-signature=4%2FrCxmt8FiH7M9RY%2Bx%2F7WVzd0Og%3D',
            },
            musicMeta: {
                musicId: '6881450829518293766',
                musicName: 'original sound',
                musicAuthor: 'TikTok',
                musicOriginal: true,
                coverThumb:
                    'https://p16-sign-va.tiktokcdn.com/musically-maliva-obj/1645136815763462~c5_100x100.jpeg?x-expires=1603573200&x-signature=XGaOhkftgl2fNr%2BT1OpxPVWUWY4%3D',
                coverMedium:
                    'https://p16-sign-va.tiktokcdn.com/musically-maliva-obj/1645136815763462~c5_720x720.jpeg?x-expires=1603573200&x-signature=bl%2BxXbD9ME6Tt4VNcWtPDAX4PZI%3D',
                coverLarge:
                    'https://p16-sign-va.tiktokcdn.com/musically-maliva-obj/1645136815763462~c5_1080x1080.jpeg?x-expires=1603573200&x-signature=4%2FrCxmt8FiH7M9RY%2Bx%2F7WVzd0Og%3D',
                duration: 15,
            },
            imageUrl:
                'https://p16-sign-sg.tiktokcdn.com/obj/tos-maliva-p-0068/5f1e128e900c4008bd6d612964ef7d1b?x-expires=1603508400&x-signature=lXSV%2BKG4%2B8G%2BGJREfeNEys6m3eg%3D',
            videoUrl:
                'https://v16-web-newkey.tiktokcdn.com/2ea83f8b07e61eb2844a644d0b1ff238/5f939968/video/tos/useast2a/tos-useast2a-pve-0068/2141262fa24c4f7687f2d6b0df121616/?a=1988&br=3316&bt=1658&cr=0&cs=0&cv=1&dr=0&ds=3&er=&l=202010232102490101902192101109C365&lr=tiktok_m&mime_type=video_mp4&qs=0&rc=anFwZTh4N2R3dzMzZzczM0ApNWY0O2QzaDszNzxlOTRlN2dkbzVlbGRkM3NfLS0xMTZzc2EwNC4vLWEuYS5hMmFiMy06Yw%3D%3D&vl=&vr=',
            videoUrlNoWaterMark: '',
            videoApiUrlNoWaterMark: '',
            videoMeta: {
                width: 576,
                height: 1024,
                ratio: '720p',
                duration: 15,
                duetInfo: { duetFromId: '0' },
                duetEnabled: true,
                stitchEnabled: true,
            },
            covers: {
                default:
                    'https://p16-sign-sg.tiktokcdn.com/obj/tos-maliva-p-0068/5f1e128e900c4008bd6d612964ef7d1b?x-expires=1603508400&x-signature=lXSV%2BKG4%2B8G%2BGJREfeNEys6m3eg%3D',
                origin:
                    'https://p16-sign-sg.tiktokcdn.com/obj/tos-maliva-p-0068/fe538f49b1334b75890ea3d741d3e357_1602212663?x-expires=1603508400&x-signature=JlLy1gxqASLp0msjeJSxMEFco7I%3D',
            },
            diggCount: 1300000,
            shareCount: 13100,
            playCount: 25700,
            secretID: 'awesome',
            commentCount: 25700,
            downloaded: false,
            mentions: ['@420doggface208', '@mickfleetwood', '@tomhayes603'],
            hashtags: [],
            effectStickers: [],
        };
        const post: PostCollector = await scraper.getVideoMeta();
        expect(post).toEqual(expectedPost);
    });

    it('Throw error if input url is empty', async () => {
        scraper.input = '';
        const pending = scraper.getVideoMeta();
        await expect(pending).rejects.toBe(`Url is missing`);
    });

    it(`Throw error if user has provided incorrect URL`, async () => {
        scraper.input = 'na';
        const pending = scraper.getVideoMeta();
        await expect(pending).rejects.toBe(`Can't extract video metadata: na`);
    });
});
/* eslint-disable no-console */
/* eslint-disable no-await-in-loop */
/* eslint-disable no-underscore-dangle */
import rp, { OptionsWithUri } from 'request-promise';
import { CookieJar } from 'request';
import { tmpdir } from 'os';
import { writeFile, readFile, mkdir } from 'fs';
import { Parser } from 'json2csv';
import ora, { Ora } from 'ora';
import { fromCallback } from 'bluebird';
import { EventEmitter } from 'events';
import { SocksProxyAgent } from 'socks-proxy-agent';
import { forEachLimit } from 'async';
import { URLSearchParams } from 'url';
import CONST from '../constant';
import { sign, makeid } from '../helpers';
import {
PostCollector,
ScrapeType,
TikTokConstructor,
Result,
MusicMetadata,
RequestQuery,
History,
Proxy,
FeedItems,
ItemListData,
TikTokMetadata,
UserMetadata,
HashtagMetadata,
Headers,
WebHtmlUserMetadata,
VideoMetadata,
} from '../types';
import { Downloader } from '../core';
export class TikTokScraper extends EventEmitter {
private mainHost: string;
private userIdStore: string;
private download: boolean;
private filepath: string;
private json2csvParser: Parser<any>;
private filetype: string;
private input: string;
private proxy: string[] | string;
private strictSSL: boolean;
private number: number;
private since: number;
private asyncDownload: number;
private asyncScraping: () => number;
private collector: PostCollector[];
private event: boolean;
private scrapeType: ScrapeType;
private cli: boolean;
private spinner: Ora;
private byUserId: boolean;
private storeHistory: boolean;
private historyPath: string;
private idStore: string;
public Downloader: Downloader;
private storeValue: string = '';
private maxCursor: number;
private noWaterMark: boolean;
private noDuplicates: string[];
private timeout: number;
private bulk: boolean;
private validHeaders: boolean;
private csrf: string;
private zip: boolean;
private fileName: string;
private test: boolean;
private hdVideo: boolean;
private webHookUrl: string;
private method: string;
private httpRequests: {
good: number;
bad: number;
};
public headers: Headers;
private sessionList: string[];
private verifyFp: string;
private store: string[];
public cookieJar: CookieJar;
/**
 * Create a scraper.
 * Copies the supplied options onto the instance, resets all counters and stores,
 * and creates the cookie jar, the CSV parser, the CLI spinner and the Downloader.
 */
constructor({
    download,
    filepath,
    filetype,
    proxy,
    strictSSL = true,
    asyncDownload,
    cli = false,
    event = false,
    progress = false,
    input,
    number,
    since,
    type,
    by_user_id = false,
    store_history = false,
    historyPath = '',
    noWaterMark = false,
    useTestEndpoints = false,
    fileName = '',
    timeout = 0,
    bulk = false,
    zip = false,
    test = false,
    hdVideo = false,
    webHookUrl = '',
    method = 'POST',
    headers,
    verifyFp = '',
    sessionList = [],
}: TikTokConstructor) {
    super();

    // When SCRAPING_FROM_DOCKER is set, output and history always go to '/usr/app/files'
    const runsInDocker = process.env.SCRAPING_FROM_DOCKER;
    const outputPath = runsInDocker ? '/usr/app/files' : filepath || '';

    this.userIdStore = '';
    this.verifyFp = verifyFp;
    this.mainHost = useTestEndpoints ? 'https://t.tiktok.com/' : 'https://m.tiktok.com/';
    this.headers = headers;
    this.download = download;
    this.filepath = outputPath;
    this.fileName = fileName;
    this.json2csvParser = new Parser({ flatten: true });
    this.filetype = filetype;
    this.input = input;
    this.test = test;
    this.proxy = proxy;
    this.strictSSL = strictSSL;
    this.number = number;
    this.since = since;
    this.csrf = '';
    this.zip = zip;
    // Cookie jar. Where all valid cookies will be stored
    this.cookieJar = rp.jar();
    this.hdVideo = hdVideo;
    this.sessionList = sessionList;
    this.asyncDownload = asyncDownload || 5;
    // Scraping concurrency: every scrape type ('user', 'trend' and all others) yields 1
    this.asyncScraping = (): number => 1;
    this.collector = [];
    this.event = event;
    this.scrapeType = type;
    this.cli = cli;
    this.spinner = ora({ text: 'TikTok Scraper Started', stream: process.stdout });
    this.byUserId = by_user_id;
    // History is only kept for CLI runs that download posts
    this.storeHistory = cli && download && store_history;
    this.historyPath = runsInDocker ? '/usr/app/files' : historyPath || tmpdir();
    this.idStore = '';
    this.noWaterMark = noWaterMark;
    this.maxCursor = 0;
    this.noDuplicates = [];
    this.timeout = timeout;
    this.bulk = bulk;
    this.validHeaders = false;
    // The downloader shares this scraper's cookie jar, headers and proxy settings
    this.Downloader = new Downloader({
        progress,
        cookieJar: this.cookieJar,
        proxy,
        noWaterMark,
        headers,
        filepath: outputPath,
        bulk,
    });
    this.webHookUrl = webHookUrl;
    this.method = method;
    this.httpRequests = { good: 0, bad: 0 };
    this.store = [];
}
/**
 * Path, without extension, of the result file (csv, zip, json).
 * A custom file name wins; otherwise the name is built from the input (user and
 * hashtag feeds) or from the scrape type, followed by the current timestamp.
 */
private get fileDestination(): string {
  let baseName = this.fileName;
  if (!baseName) {
    const namedAfterInput = this.scrapeType === 'user' || this.scrapeType === 'hashtag';
    baseName = `${namedAfterInput ? this.input : this.scrapeType}_${Date.now()}`;
  }

  // Posts downloaded into a folder keep their metadata next to them
  const storedInsidePostFolder = !this.zip && this.download;
  if (storedInsidePostFolder) {
    return `${this.folderDestination}/${baseName}`;
  }
  if (this.filepath) {
    return `${this.filepath}/${baseName}`;
  }
  return baseName;
}
/**
 * Folder where all downloaded posts will be saved.
 * @throws {TypeError} when the scrape type has no download folder
 */
private get folderDestination(): string {
  let folderName: string;
  switch (this.scrapeType) {
    case 'user':
      folderName = this.input;
      break;
    case 'hashtag':
      folderName = `#${this.input}`;
      break;
    case 'music':
      folderName = `music_${this.input}`;
      break;
    case 'trend':
      folderName = `trend`;
      break;
    case 'video':
      folderName = `video`;
      break;
    default:
      throw new TypeError(`${this.scrapeType} is not supported`);
  }
  // Without a base path the folder is relative to the working directory
  return this.filepath ? `${this.filepath}/${folderName}` : folderName;
}
/**
 * Feed API endpoint of the current scrape type.
 * @throws {TypeError} when the scrape type has no feed endpoint
 */
private get getApiEndpoint(): string {
  const feedPaths = new Map<string, string>([
    ['user', 'api/post/item_list/'],
    ['trend', 'api/recommend/item_list/'],
    ['hashtag', 'api/challenge/item_list/'],
    ['music', 'api/music/item_list/'],
  ]);
  const feedPath = feedPaths.get(this.scrapeType);
  if (feedPath === undefined) {
    throw new TypeError(`${this.scrapeType} is not supported`);
  }
  return `${this.mainHost}${feedPath}`;
}
/**
 * Pick the proxy for the next request.
 * A list of proxies yields a random entry. Socks proxies are returned as an
 * agent, any other proxy as the plain address. When no proxy is configured
 * (including an empty list) the returned `proxy` is an empty string.
 */
private get getProxy(): Proxy {
  let proxy = '';
  if (Array.isArray(this.proxy)) {
    // An empty list means "no proxy"; it must not leak through as a truthy array
    if (this.proxy.length) {
      proxy = this.proxy[Math.floor(Math.random() * this.proxy.length)];
    }
  } else if (this.proxy) {
    proxy = this.proxy;
  }

  if (!proxy) {
    return {
      socks: false,
      proxy: '',
    };
  }
  if (proxy.indexOf('socks4://') > -1 || proxy.indexOf('socks5://') > -1) {
    return {
      socks: true,
      proxy: new SocksProxyAgent(proxy),
    };
  }
  return {
    socks: false,
    proxy,
  };
}
/**
 * Main request method.
 * Merges the instance headers, cookie jar, proxy and SSL settings into the
 * request, sends it and hands the result back after `this.timeout` milliseconds.
 * @param {} OptionsWithUri request options
 * @param bodyOnly when true only the response body is returned, otherwise the full response
 * @returns the response body or the full response
 * @throws whatever the underlying request rejects with
 */
private async request<T>(
  { uri, method, qs, body, form, headers, json, gzip, followAllRedirects, simple = true }: OptionsWithUri,
  bodyOnly = true,
): Promise<T> {
  // A plain async method (instead of an async Promise executor) makes sure that a
  // failure in the preparation steps below rejects the returned promise.
  const proxy = this.getProxy;
  const options = ({
    jar: this.cookieJar,
    uri,
    method,
    ...(qs ? { qs } : {}),
    ...(body ? { body } : {}),
    ...(form ? { form } : {}),
    headers: {
      ...this.headers,
      ...headers,
      ...(this.csrf ? { 'x-secsdk-csrf-token': this.csrf } : {}),
    },
    ...(json ? { json: true } : {}),
    ...(gzip ? { gzip: true } : {}),
    resolveWithFullResponse: true,
    followAllRedirects: followAllRedirects || false,
    simple,
    ...(proxy.proxy && proxy.socks ? { agent: proxy.proxy } : {}),
    ...(proxy.proxy && !proxy.socks ? { proxy: `http://${proxy.proxy}/` } : {}),
    ...(this.strictSSL === false ? { rejectUnauthorized: false } : {}),
    timeout: 10000,
  } as unknown) as OptionsWithUri;

  // Use a random session cookie when a session list was provided
  const session = this.sessionList[Math.floor(Math.random() * this.sessionList.length)];
  if (session) {
    this.cookieJar.setCookie(session, 'https://tiktok.com');
  }

  /**
   * Set tt_webid_v2 cookie to access video url
   */
  const cookies = this.cookieJar.getCookieString('https://tiktok.com');
  if (cookies.indexOf('tt_webid_v2') === -1) {
    this.cookieJar.setCookie(`tt_webid_v2=69${makeid(17)}; Domain=tiktok.com; Path=/; Secure; hostOnly=false`, 'https://tiktok.com');
  }

  const response = await rp(options);

  // Extract valid csrf token; a response without the header leaves the token untouched
  if (options.method === 'HEAD') {
    const csrf = response.headers['x-ware-csrf-token'];
    if (typeof csrf === 'string') {
      this.csrf = csrf.split(',')[1] || '';
    }
  }

  // Throttle: hand the result back only after the configured delay
  await new Promise(resolve => setTimeout(resolve, this.timeout));
  return bodyOnly ? response.body : response;
}
/**
 * Report an error that happened before scraping started.
 * Stops the spinner (cli, non-bulk mode), then emits the error in event mode
 * or throws it otherwise.
 */
private returnInitError(error) {
  const spinnerInUse = this.cli && !this.bulk;
  if (spinnerInUse) {
    this.spinner.stop();
  }
  if (!this.event) {
    throw error;
  }
  this.emit('error', error);
}
/**
 * Initiate scraping process.
 * Validates the options, collects the posts and, unless event mode is used,
 * stores/downloads the result and returns a summary of the created files.
 * In event mode the result is delivered through the 'data', 'done' and 'error' events.
 */
// eslint-disable-next-line consistent-return
public async scrape(): Promise<Result | any> {
  if (this.cli && !this.bulk) {
    this.spinner.start();
  }

  // Validate first, so that an invalid call does not leave folders behind
  if (!this.scrapeType || CONST.scrape.indexOf(this.scrapeType) === -1) {
    return this.returnInitError(`Missing scraping type. Scrape types: ${CONST.scrape} `);
  }
  if (this.scrapeType !== 'trend' && !this.input) {
    return this.returnInitError('Missing input');
  }

  if (this.download && !this.zip) {
    try {
      await fromCallback(cb => mkdir(this.folderDestination, { recursive: true }, cb));
    } catch (error) {
      return this.returnInitError(error.message);
    }
  }

  await this.mainLoop();

  // In event mode every post was already emitted, nothing is stored
  if (this.event) {
    return this.emit('done', 'completed');
  }

  if (this.storeHistory) {
    await this.getDownloadedVideosFromHistory();
  }

  if (this.noWaterMark) {
    await this.withoutWatermark();
  }

  const [json, csv, zip] = await this.saveCollectorData();

  if (this.storeHistory) {
    // We need to make sure that we save data only about downloaded videos
    this.collector.forEach(item => {
      if (this.store.indexOf(item.id) === -1 && item.downloaded) {
        this.store.push(item.id);
      }
    });
    await this.storeDownloadProgress();
  }

  if (this.webHookUrl) {
    await this.sendDataToWebHookUrl();
  }

  return {
    headers: { ...this.headers, cookie: this.cookieJar.getCookieString('https://tiktok.com') },
    collector: this.collector,
    ...(this.download ? { zip } : {}),
    ...(this.filetype === 'all' ? { json, csv } : {}),
    ...(this.filetype === 'json' ? { json } : {}),
    ...(this.filetype === 'csv' ? { csv } : {}),
    ...(this.webHookUrl ? { webhook: this.httpRequests } : {}),
  };
}
/**
 * Resolve the watermark-free urls of every collected post, five posts at a time.
 * The returned promise rejects as soon as one post can't be processed.
 */
private withoutWatermark() {
  const resolveUrls = async (post: PostCollector) => {
    try {
      const apiUrl = await this.extractVideoId(post);
      post.videoApiUrlNoWaterMark = apiUrl;
      post.videoUrlNoWaterMark = await this.getUrlWithoutTheWatermark(apiUrl!);
    } catch {
      throw new Error(`Can't extract unique video id`);
    }
  };

  return new Promise((resolve, reject) => {
    const finished = err => {
      if (err) {
        return reject(err);
      }
      resolve(null);
    };
    forEachLimit(this.collector, 5, resolveUrls, finished);
  });
}
/**
 * Extract the unique video id and build the API url of the video without the watermark.
 * All videos after July 27 2020 do not store unique video id
 * it means that we can't extract url to the video without the watermark
 * @param item collected post; its `videoUrl` is downloaded and searched for the id
 * @returns the API url, or an empty string when the id can't be extracted
 */
private async extractVideoId(item: PostCollector): Promise<string> {
  if (item.createTime > 1595808000) {
    return '';
  }

  try {
    const result = await rp({
      uri: item.videoUrl,
      headers: this.headers,
    });
    // Convert the response once; it is the whole video file
    const video = Buffer.from(result);
    const position = video.indexOf('vid:');
    if (position !== -1) {
      // The id is the 32 bytes that follow the 'vid:' marker
      const id = video.slice(position + 4, position + 36).toString();
      return `https://api2-16-h2.musical.ly/aweme/v1/play/?video_id=${id}&vr_type=0&is_play_url=1&source=PackSourceEnum_PUBLISH&media_type=4${
        this.hdVideo ? `&ratio=default&improve_bitrate=1` : ''
      }`;
    }
  } catch {
    // continue regardless of error
  }
  return '';
}
/**
 * Get temporary url to the video without the watermark.
 * The API url is requested with redirects enabled and the address of the
 * final request is returned.
 * The url has expiration time (between 5-20 minutes+-)
 * @param uri API url of the video; an empty value yields an empty string
 */
private async getUrlWithoutTheWatermark(uri: string): Promise<string> {
  if (!uri) {
    return '';
  }

  const mobileAppHeaders = {
    'user-agent':
      'com.zhiliaoapp.musically/2021600040 (Linux; U; Android 5.0; en_US; SM-N900T; Build/LRX21V; Cronet/TTNetVersion:6c7b701a 2020-04-23 QuicVersion:0144d358 2020-03-24)',
    'sec-fetch-mode': 'navigate',
  };
  const options = {
    uri,
    method: 'GET',
    headers: mobileAppHeaders,
    followAllRedirects: true,
    simple: false,
  };

  try {
    // Full response is needed: the final url lives on the request object
    const { request: finalRequest } = await this.request<{ request: { uri: { href: string } } }>(options, false);
    return finalRequest.uri.href;
  } catch {
    throw new Error(`Can't extract video url without the watermark`);
  }
}
/**
 * Main loop that collects all required metadata from the tiktok web api.
 * Runs up to 1000 feed requests; the loop stops as soon as a request reports
 * that scraping is finished or fails.
 */
private mainLoop(): Promise<any> {
  return new Promise((resolve, reject) => {
    const taskArray = Array.from({ length: 1000 }, (v, k) => k + 1);
    forEachLimit(
      taskArray,
      this.asyncScraping(),
      (item, cb) => {
        // Hashtag and music feeds are paged by offset: page number * page size
        const offsetOf = (count?: number) => (item === 1 ? 0 : (item - 1) * count!);

        let pageRequest: Promise<boolean>;
        switch (this.scrapeType) {
          case 'user':
            pageRequest = this.getUserId().then(query => this.submitScrapingRequest({ ...query, cursor: this.maxCursor }, true));
            break;
          case 'hashtag':
            pageRequest = this.getHashTagId().then(query => this.submitScrapingRequest({ ...query, cursor: offsetOf(query.count) }, true));
            break;
          case 'trend':
            pageRequest = this.getTrendingFeedQuery().then(query => this.submitScrapingRequest({ ...query }, true));
            break;
          case 'music':
            pageRequest = this.getMusicFeedQuery().then(query => this.submitScrapingRequest({ ...query, cursor: offsetOf(query.count) }, true));
            break;
          default:
            // Without this the callback is never invoked and the loop never ends
            cb(new TypeError(`${this.scrapeType} is not supported`));
            return;
        }

        // `true` is passed as the "error" to stop the loop once scraping is done
        pageRequest.then(kill => cb(kill || null)).catch(error => cb(error));
      },
      err => {
        if (err && err !== true) {
          return reject(err);
        }
        resolve(null);
      },
    );
  });
}
/**
 * Submit request to the TikTok web API
 * Collect received metadata
 * @param query feed query parameters
 * @param updatedApiResponse when true the posts are read from `itemList`, otherwise from `items`
 * @returns true when scraping has to stop, false when the next page can be requested
 * @throws {Error} when the API reports a failure or returns no posts
 */
private async submitScrapingRequest(query: RequestQuery, updatedApiResponse = false): Promise<boolean> {
  try {
    if (!this.validHeaders) {
      /**
       * As of August 13, 2021 the trend api endpoint requires ttwid cookie value that can be extracted by sending GET request to the tiktok trending page
       */
      if (this.scrapeType === 'trend') {
        await this.getValidHeaders(`https://www.tiktok.com/foryou`, false, 'GET');
      }
      this.validHeaders = true;
    }

    const result = await this.scrapeData<ItemListData>(query);
    if (result.statusCode !== 0) {
      throw new Error(`Can't scrape more posts`);
    }

    const { hasMore, maxCursor, cursor } = result;
    if ((updatedApiResponse && !result.itemList) || (!updatedApiResponse && !result.items)) {
      throw new Error('No more posts');
    }

    const { done } = await this.collectPosts(updatedApiResponse ? result.itemList : result.items);

    // Everything that was asked for has been collected: no shortage to report
    if (done) {
      return true;
    }
    if (!hasMore) {
      console.error(`Only ${this.collector.length} results could be found.`);
      return true;
    }

    // Older responses carry `maxCursor`, newer ones `cursor`
    this.maxCursor = parseInt(maxCursor === undefined ? cursor : maxCursor, 10);
    return false;
  } catch (error) {
    throw error.message ? new Error(error.message) : error;
  }
}
/**
 * Download the collected posts (when enabled) and store collector data in the
 * CSV and/or JSON files.
 * @returns paths of the json file, the csv file and the zip archive/post folder;
 *          empty strings when nothing was collected
 */
private async saveCollectorData(): Promise<string[]> {
  if (this.download && this.cli) {
    this.spinner.stop();
  }

  let json = '';
  let csv = '';
  let zip = '';

  if (this.collector.length) {
    // The destination contains a timestamp: read it once so that the downloaded
    // archive and the returned json/csv/zip paths all share the same base name
    const destination = this.fileDestination;

    if (this.download && !this.test) {
      await this.Downloader.downloadPosts({
        zip: this.zip,
        folder: this.folderDestination,
        collector: this.collector,
        fileName: destination,
        asyncDownload: this.asyncDownload,
      });
    }

    json = `${destination}.json`;
    csv = `${destination}.csv`;
    zip = this.zip ? `${destination}.zip` : this.folderDestination;

    await this.saveMetadata({ json, csv });
  }

  if (this.cli) {
    this.spinner.stop();
  }
  return [json, csv, zip];
}
/**
 * Save post metadata in the format(s) selected by `filetype`.
 * Nothing is written when no posts were collected or the file type is unknown.
 * @param param0 destination paths: `json` for the JSON file, `csv` for the CSV file
 */
public async saveMetadata({ json, csv }) {
  if (!this.collector.length) {
    return;
  }

  const writeJson = () => fromCallback(cb => writeFile(json, JSON.stringify(this.collector), cb));
  const writeCsv = () => fromCallback(cb => writeFile(csv, this.json2csvParser.parse(this.collector), cb));

  switch (this.filetype) {
    case 'json':
      await writeJson();
      break;
    case 'csv':
      await writeCsv();
      break;
    case 'all':
      // Both files are independent, write them concurrently
      await Promise.all([writeJson(), writeCsv()]);
      break;
    default:
      break;
  }
}
/**
 * If option -s is being used then we need to
 * retrieve already downloaded video id's to prevent them to be downloaded again.
 * Posts found in the history are flagged as repeated and removed from the collector.
 */
private async getDownloadedVideosFromHistory() {
  try {
    const readFromStore = (await fromCallback(cb =>
      readFile(`${this.historyPath}/${this.storeValue}.json`, { encoding: 'utf-8' }, cb),
    )) as string;
    const storedIds = JSON.parse(readFromStore);
    // Ignore a history file that is valid JSON but not a list of ids
    if (Array.isArray(storedIds)) {
      this.store = storedIds;
    }
  } catch {
    // continue regardless of error
  }

  const downloadedIds = new Set(this.store);
  this.collector.forEach(item => {
    if (downloadedIds.has(item.id)) {
      item.repeated = true;
    }
  });
  this.collector = this.collector.filter(item => !item.repeated);
}
/**
 * Store progress to avoid downloading duplicates
 * Only available from the CLI
 * Writes the list of downloaded post ids to the per-feed history file and
 * updates the feed entry in `tiktok_history.json`. Write errors are ignored.
 */
private async storeDownloadProgress() {
  const newlyDownloaded = this.collector.filter(item => item.downloaded).length;
  if (!this.storeValue || !newlyDownloaded) {
    return;
  }

  const historyType = this.scrapeType === 'trend' ? 'trend' : `${this.scrapeType}_${this.input}`;
  const storeFile = `${this.historyPath}/${this.storeValue}.json`;
  const historyFile = `${this.historyPath}/tiktok_history.json`;

  let history = {} as History;
  try {
    const readFromStore = (await fromCallback(cb => readFile(historyFile, { encoding: 'utf-8' }, cb))) as string;
    history = JSON.parse(readFromStore);
  } catch {
    // No readable history yet: start with an empty one
  }

  // A feed that is not in the history yet starts counting from zero
  const knownEntry = history[historyType];
  const previouslyDownloaded = knownEntry ? knownEntry.downloaded_posts : 0;
  history[historyType] = {
    type: this.scrapeType,
    input: this.input,
    downloaded_posts: previouslyDownloaded + totalOf(newlyDownloaded),
    last_change: new Date(),
    file_location: storeFile,
  };

  try {
    await fromCallback(cb => writeFile(storeFile, JSON.stringify(this.store), cb));
  } catch {
    // continue regardless of error
  }

  try {
    await fromCallback(cb => writeFile(historyFile, JSON.stringify(history), cb));
  } catch {
    // continue regardless of error
  }

  function totalOf(count: number): number {
    return count;
  }
}
/**
 * Collect post data from the API response.
 * Posts older than `since` and posts that were already seen are skipped. In
 * event mode the post is emitted and only a placeholder is kept for counting.
 * @param posts posts of one feed page
 * @returns `{ done }`; `done` is true when no further pages have to be requested
 */
private collectPosts(posts: FeedItems[]) {
  const result = {
    done: false,
  };

  for (const post of posts) {
    if (this.since && post.createTime < this.since) {
      // In a chronological feed every following post is older as well
      result.done = CONST.chronologicalTypes.indexOf(this.scrapeType) !== -1;
      if (result.done) {
        break;
      }
      continue;
    }

    const isNewPost = this.noDuplicates.indexOf(post.id) === -1;
    if (isNewPost) {
      this.noDuplicates.push(post.id);

      const { author, authorStats, video, stats, music } = post;
      const item: PostCollector = {
        id: post.id,
        secretID: video.id,
        text: post.desc,
        createTime: post.createTime,
        authorMeta: {
          id: author.id,
          secUid: author.secUid,
          name: author.uniqueId,
          nickName: author.nickname,
          verified: author.verified,
          signature: author.signature,
          avatar: author.avatarLarger,
          following: authorStats.followingCount,
          fans: authorStats.followerCount,
          heart: authorStats.heartCount,
          video: authorStats.videoCount,
          digg: authorStats.diggCount,
        },
        // Music information is not always part of the post
        ...(music
          ? {
              musicMeta: {
                musicId: music.id,
                musicName: music.title,
                musicAuthor: music.authorName,
                musicOriginal: music.original,
                musicAlbum: music.album,
                playUrl: music.playUrl,
                coverThumb: music.coverThumb,
                coverMedium: music.coverMedium,
                coverLarge: music.coverLarge,
                duration: music.duration,
              },
            }
          : {}),
        covers: {
          default: video.cover,
          origin: video.originCover,
          dynamic: video.dynamicCover,
        },
        webVideoUrl: `https://www.tiktok.com/@${author.uniqueId}/video/${post.id}`,
        videoUrl: video.downloadAddr,
        videoUrlNoWaterMark: '',
        videoApiUrlNoWaterMark: '',
        videoMeta: {
          height: video.height,
          width: video.width,
          duration: video.duration,
        },
        diggCount: stats.diggCount,
        shareCount: stats.shareCount,
        playCount: stats.playCount,
        commentCount: stats.commentCount,
        downloaded: false,
        mentions: post.desc.match(/(@\w+)/g) || [],
        hashtags: post.challenges
          ? post.challenges.map(({ id, title, desc, coverLarger }) => ({
              id,
              name: title,
              title: desc,
              cover: coverLarger,
            }))
          : [],
        effectStickers: post.effectStickers
          ? post.effectStickers.map(({ ID, name }) => ({
              id: ID,
              name,
            }))
          : [],
      };

      if (this.event) {
        this.emit('data', item);
        // Keep the collector length in sync without holding on to the post
        this.collector.push({} as PostCollector);
      } else {
        this.collector.push(item);
      }
    }

    if (this.number && this.collector.length >= this.number) {
      result.done = true;
      break;
    }
  }

  return result;
}
/**
 * In order to execute some request, we need to extract valid cookie headers
 * This request is being executed only once per run
 * @param url address to request
 * @param signUrl when true a `_signature` query parameter is added
 * @param method HTTP method of the request
 */
private async getValidHeaders(url = '', signUrl = true, method = 'HEAD') {
  const signedQuery = signUrl
    ? {
        qs: {
          _signature: sign(url, this.headers['user-agent']),
        },
      }
    : {};
  const csrfHeaders = {
    'x-secsdk-csrf-request': 1,
    'x-secsdk-csrf-version': '1.2.5',
  };
  const options = {
    uri: url,
    method,
    ...signedQuery,
    headers: csrfHeaders,
  };

  try {
    await this.request<string>(options);
  } catch (error) {
    throw new Error(error.message);
  }
}
/**
 * Send a signed GET request to the feed endpoint of the current scrape type.
 * Also records which feed is being scraped in `storeValue`.
 * @param qs query parameters of the feed request
 */
private async scrapeData<T>(qs: RequestQuery): Promise<T> {
  if (this.scrapeType === 'trend') {
    this.storeValue = 'trend';
  } else {
    this.storeValue = qs.id || qs.challengeID! || qs.musicID!;
  }

  // The signature is calculated over the complete, still unsigned url
  const endpoint = this.getApiEndpoint;
  const queryString = new URLSearchParams(qs as any).toString();
  const _signature = sign(`${endpoint}?${queryString}`, this.headers['user-agent']);

  const options = {
    uri: endpoint,
    method: 'GET',
    qs: {
      ...qs,
      _signature,
    },
    json: true,
  };

  try {
    return await this.request<T>(options);
  } catch (error) {
    throw new Error(error.message);
  }
}
/**
 * Query parameters of the trending feed request
 */
// eslint-disable-next-line class-methods-use-this
private async getTrendingFeedQuery(): Promise<RequestQuery> {
  const trendingQuery: RequestQuery = {
    aid: 1988,
    app_name: 'tiktok_web',
    device_platform: 'web_pc',
    lang: '',
    count: 30,
    from_page: 'fyp',
    itemID: 1,
  };
  return trendingQuery;
}
/**
 * Query parameters of the music feed request.
 * When the input is a music url, it is replaced by the music id found in it.
 */
private async getMusicFeedQuery(): Promise<RequestQuery> {
  const [, idFromUrl] = /.com\/music\/[\w+-]+-(\d{15,22})/.exec(this.input) || [];
  if (idFromUrl) {
    this.input = idFromUrl as string;
  }

  const musicQuery: RequestQuery = {
    musicID: this.input,
    lang: '',
    aid: 1988,
    count: 30,
    cursor: 0,
    verifyFp: '',
  };
  return musicQuery;
}
/**
 * Query parameters of the hashtag feed request.
 * The hashtag id is looked up once and then reused from `idStore`.
 * @throws {Error} when the hashtag can't be found or the lookup fails
 */
private async getHashTagId(): Promise<RequestQuery> {
  const feedQuery = (): RequestQuery => ({
    challengeID: this.idStore,
    count: 30,
    cursor: 0,
    aid: 1988,
    verifyFp: this.verifyFp,
  });

  // Id is already known from a previous lookup
  if (this.idStore) {
    return feedQuery();
  }

  const id = encodeURIComponent(this.input);
  const lookup = {
    uri: `${this.mainHost}node/share/tag/${id}?uniqueId=${id}`,
    qs: {
      user_agent: this.headers['user-agent'],
    },
    method: 'GET',
    json: true,
  };

  try {
    const response = await this.request<TikTokMetadata>(lookup);
    if (response.statusCode !== 0) {
      throw new Error(`Can not find the hashtag: ${this.input}`);
    }
    this.idStore = response.challengeInfo.challenge.id;
    return feedQuery();
  } catch (error) {
    throw new Error(error.message);
  }
}
/**
 * Query parameters of the user feed request.
 * With `byUserId`, or once the ids are cached, no profile lookup is made;
 * otherwise the user profile is requested to find the user id and secUid.
 * @throws {Error} when the profile lookup fails
 */
private async getUserId(): Promise<RequestQuery> {
  // Parameters shared by both variants; kept last so the key order stays the same
  const browserState = {
    app_name: 'tiktok_web',
    device_platform: 'web_pc',
    cookie_enabled: true,
    history_len: 2,
    focus_state: true,
    is_fullscreen: false,
  };

  if (this.byUserId || this.idStore) {
    return {
      id: this.userIdStore,
      secUid: this.idStore || this.input,
      lang: '',
      aid: 1988,
      count: 30,
      cursor: 0,
      ...browserState,
    };
  }

  try {
    const { user } = await this.getUserProfileInfo();
    this.idStore = user.secUid;
    this.userIdStore = user.id;
    return {
      id: this.userIdStore,
      aid: 1988,
      secUid: this.idStore,
      count: 30,
      lang: '',
      cursor: 0,
      ...browserState,
    };
  } catch (error) {
    throw new Error(error.message);
  }
}
/**
 * Get user profile information.
 * The profile page of the user named by the input is requested and the user
 * metadata is read from the `__NEXT_DATA__` script of the page.
 * @throws {Error} when the input is missing, the user does not exist (404)
 *                 or the metadata can't be extracted
 */
public async getUserProfileInfo(): Promise<UserMetadata> {
  if (!this.input) {
    throw new Error(`Username is missing`);
  }

  const profilePage = {
    method: 'GET',
    uri: `https://www.tiktok.com/@${encodeURIComponent(this.input)}`,
    json: true,
  };

  try {
    const html = await this.request<string>(profilePage);
    const [, afterScriptTag] = html.split(/<script id="__NEXT_DATA__" type="application\/json" nonce="[\w-]+" crossorigin="anonymous">/);
    const [rawMetadata] = afterScriptTag.split(`</script>`);
    if (rawMetadata) {
      const userMetadata: WebHtmlUserMetadata = JSON.parse(rawMetadata);
      return userMetadata.props.pageProps.userInfo;
    }
  } catch (err) {
    if (err.statusCode === 404) {
      throw new Error('User does not exist');
    }
    // Any other failure ends up in the generic error below
  }

  throw new Error(`Can't extract user metadata from the html page. Make sure that user does exist and try to use proxy`);
}
/**
 * Get hashtag information for the hashtag named by the input.
 * @throws {Error} when the input is missing, the hashtag can't be found or the request fails
 */
public async getHashtagInfo(): Promise<HashtagMetadata> {
  if (!this.input) {
    throw new Error(`Hashtag is missing`);
  }

  // Encode the hashtag so that special characters can't break the url
  const tag = encodeURIComponent(this.input);
  const query = {
    uri: `${this.mainHost}node/share/tag/${tag}?uniqueId=${tag}`,
    qs: {
      appId: 1233,
    },
    method: 'GET',
    json: true,
  };

  try {
    const response = await this.request<TikTokMetadata>(query);
    if (!response || response.statusCode !== 0) {
      throw new Error(`Can't find hashtag: ${this.input}`);
    }
    return response.challengeInfo;
  } catch (error) {
    throw new Error(error.message);
  }
}
/**
 * Get music information.
 * The input is expected to be a music link; title and id are read from it.
 * @throws {Error} when the input is missing, the music can't be found or the request fails
 */
public async getMusicInfo(): Promise<MusicMetadata> {
  if (!this.input) {
    throw new Error(`Music is missing`);
  }

  const titleMatch = /music\/([\w-]+)-\d+/.exec(this.input);
  const idMatch = /music\/[\w-]+-(\d+)/.exec(this.input);
  const musicName = titleMatch ? titleMatch[1] : '';
  const musicId = idMatch ? idMatch[1] : '';

  const uri = `https://www.tiktok.com/node/share/music/${musicName}-${musicId}`;
  const qs = {
    screen_width: 1792,
    screen_height: 1120,
    lang: 'en',
    priority_region: '',
    referer: '',
    root_referer: '',
    app_language: 'en',
    is_page_visible: true,
    history_len: 6,
    focus_state: true,
    is_fullscreen: false,
    aid: 1988,
    app_name: 'tiktok_web',
    timezone_name: '',
    device_platform: 'web',
    musicId,
    musicName,
  };

  // The signature is calculated over the unsigned url and sent as the last parameter
  const unsignedURL = `${uri}?${new URLSearchParams(qs as any).toString()}`;
  const _signature = sign(unsignedURL, this.headers['user-agent']);
  const query = {
    uri,
    qs: { ...qs, _signature },
    method: 'GET',
    json: true,
  };

  try {
    const response = await this.request<TikTokMetadata>(query);
    if (response.statusCode !== 0) {
      throw new Error(`Can't find music data: ${this.input}`);
    }
    return response.musicInfo;
  } catch (error) {
    throw new Error(error.message);
  }
}
/**
 * Sign the url given as input with the configured user agent.
 * @throws {Error} when the input is missing
 */
public async signUrl() {
  const url = this.input;
  if (!url) {
    throw new Error(`Url is missing`);
  }
  const userAgent = this.headers['user-agent'];
  return sign(url, userAgent);
}
/**
 * Get video metadata from the HTML
 * This method can be used if you aren't able to retrieve video metadata from a simple API call
 * Can be slow
 * Two page layouts are supported: the `__NEXT_DATA__` script and the `SIGI_STATE` script.
 * @throws {Error} a generic error for every failure
 */
private async getVideoMetadataFromHtml(): Promise<FeedItems> {
  const videoPage = {
    uri: this.input,
    method: 'GET',
    json: true,
  };

  try {
    const html = await this.request<string>(videoPage);
    if (!html) {
      throw new Error(`Can't extract video meta data`);
    }

    if (html.includes('__NEXT_DATA__')) {
      const [, afterScriptTag] = html.split(/<script id="__NEXT_DATA__" type="application\/json" nonce="[\w-]+" crossorigin="anonymous">/);
      const [rawMetadata] = afterScriptTag.split(`</script>`);
      const pageData = JSON.parse(rawMetadata);
      return pageData.props.pageProps.itemInfo.itemStruct as FeedItems;
    }

    if (html.includes('SIGI_STATE')) {
      const [, afterScriptTag] = html.split('<script id="SIGI_STATE" type="application/json">');
      const [rawMetadata] = afterScriptTag.split('</script>');
      const pageState = JSON.parse(rawMetadata);
      // The item module holds the video of the requested page as its first entry
      const [firstItem] = Object.values(pageState.ItemModule);
      return firstItem as FeedItems;
    }

    throw new Error('No available parser for html page');
  } catch (error) {
    // Every failure above is reported with the same message
    throw new Error(`Can't extract video metadata: ${this.input}`);
  }
}
/**
 * Get video metadata from the regular API endpoint
 * @param url video url; the scraper input is used when it is empty
 * @throws {Error} when the video does not exist (404) or the metadata can't be extracted
 */
private async getVideoMetadata(url = ''): Promise<FeedItems> {
  // The url that is actually parsed is also the one reported in the error message
  const videoUrl = url || this.input;
  const videoData = /tiktok.com\/(@[\w.-]+)\/video\/(\d+)/.exec(videoUrl);
  if (videoData) {
    const videoUsername = videoData[1];
    const videoId = videoData[2];

    const options = {
      method: 'GET',
      uri: `https://www.tiktok.com/node/share/video/${videoUsername}/${videoId}`,
      json: true,
    };

    try {
      const response = await this.request<VideoMetadata>(options);
      if (response.statusCode === 0) {
        return response.itemInfo.itemStruct;
      }
    } catch (err) {
      if (err.statusCode === 404) {
        throw new Error('Video does not exist');
      }
      // Any other failure ends up in the generic error below
    }
  }
  throw new Error(`Can't extract video metadata: ${videoUrl}`);
}
/**
 * Get the metadata of the video given as input.
 * When `noWaterMark` is enabled, the watermark-free urls are resolved as well
 * (best effort). The post is added to the collector and returned.
 * @param html when true the metadata is read from the video page, otherwise from the API endpoint
 * @throws {Error} when the input is missing or the metadata can't be extracted
 */
public async getVideoMeta(html = true): Promise<PostCollector> {
  if (!this.input) {
    throw new Error(`Url is missing`);
  }

  const videoData: FeedItems = html ? await this.getVideoMetadataFromHtml() : await this.getVideoMetadata();

  const videoItem = {
    id: videoData.id,
    secretID: videoData.video.id,
    text: videoData.desc,
    createTime: videoData.createTime,
    authorMeta: {
      id: videoData.author.id,
      secUid: videoData.author.secUid,
      name: videoData.author.uniqueId,
      nickName: videoData.author.nickname,
      following: videoData.authorStats.followingCount,
      fans: videoData.authorStats.followerCount,
      heart: videoData.authorStats.heartCount,
      video: videoData.authorStats.videoCount,
      digg: videoData.authorStats.diggCount,
      verified: videoData.author.verified,
      private: videoData.author.secret,
      signature: videoData.author.signature,
      avatar: videoData.author.avatarLarger,
    },
    // Music information is not always part of the post (same rule as for feed posts)
    ...(videoData.music
      ? {
          musicMeta: {
            musicId: videoData.music.id,
            musicName: videoData.music.title,
            musicAuthor: videoData.music.authorName,
            musicOriginal: videoData.music.original,
            coverThumb: videoData.music.coverThumb,
            coverMedium: videoData.music.coverMedium,
            coverLarge: videoData.music.coverLarge,
            duration: videoData.music.duration,
          },
        }
      : {}),
    imageUrl: videoData.video.cover,
    videoUrl: videoData.video.playAddr,
    videoUrlNoWaterMark: '',
    videoApiUrlNoWaterMark: '',
    videoMeta: {
      width: videoData.video.width,
      height: videoData.video.height,
      ratio: videoData.video.ratio,
      duration: videoData.video.duration,
      duetEnabled: videoData.duetEnabled,
      stitchEnabled: videoData.stitchEnabled,
      duetInfo: videoData.duetInfo,
    },
    covers: {
      default: videoData.video.cover,
      origin: videoData.video.originCover,
    },
    diggCount: videoData.stats.diggCount,
    shareCount: videoData.stats.shareCount,
    playCount: videoData.stats.playCount,
    commentCount: videoData.stats.commentCount,
    downloaded: false,
    mentions: videoData.desc.match(/(@\w+)/g) || [],
    hashtags: videoData.challenges
      ? videoData.challenges.map(({ id, title, desc, profileLarger }) => ({
          id,
          name: title,
          title: desc,
          cover: profileLarger,
        }))
      : [],
    effectStickers: videoData.effectStickers
      ? videoData.effectStickers.map(({ ID, name }) => ({
          id: ID,
          name,
        }))
      : [],
  } as PostCollector;

  try {
    if (this.noWaterMark) {
      videoItem.videoApiUrlNoWaterMark = await this.extractVideoId(videoItem);
      videoItem.videoUrlNoWaterMark = await this.getUrlWithoutTheWatermark(videoItem.videoApiUrlNoWaterMark);
    }
  } catch {
    // continue regardless of error
  }

  this.collector.push(videoItem);
  return videoItem;
}
/**
 * If webhook url was provided then send POST/GET request to the URL with the data from the this.collector
 * Three requests are sent at a time. A failed request does not stop the
 * others; successes and failures are counted in `httpRequests`.
 */
private sendDataToWebHookUrl() {
  const deliver = (item, cb) => {
    // POST carries the post as body, GET as url-encoded `json` query parameter
    const payload =
      this.method === 'POST'
        ? { body: item }
        : this.method === 'GET'
        ? { qs: { json: encodeURIComponent(JSON.stringify(item)) } }
        : {};

    rp({
      uri: this.webHookUrl,
      method: this.method,
      headers: {
        'user-agent': 'TikTok-Scraper',
      },
      ...payload,
      json: true,
    })
      .then(() => {
        this.httpRequests.good += 1;
      })
      .catch(() => {
        this.httpRequests.bad += 1;
      })
      .finally(() => cb(null));
  };

  return new Promise(resolve => {
    forEachLimit(this.collector, 3, deliver, () => {
      resolve(null);
    });
  });
}
}
(//*
//(((///(//((#//*//(/*//*/*,/(
. (((((((%##%/(((##%/((*///((/(#(((((//(#/(# . .. .
/##(#%%##%/#(((#%#((#%%%#((/(###(###(/((#(/(%%(,
. /%#%%%#%%#%##(%%%((#%%%%%#(%(#/####(#(####/(((((%%%%, .
.(%%%%%%##%###%%%#((#%%%%%(%##%/#(##%%#(#%%###(#(#/(%%##(
. ,%&%&&#%&%%#%%#%#%%%%%%%%%%&%%##(#/(###%###%%%%#######%###
(* .&&&&#&&&##%&%%%%%&%%%%%##%%%%%%#%(((###%%%%%%%%%%%%##(##%%.,,.,.
*. //&&%&&%#&%&&&&&&&&%%%&%%%&%&&&%&%%##((((##(##%#&#%%%%%(####*(/
*#(/#&&%%#%&&&&&&&%%%%%%%%%%%&&&&&&&&&&&&&%%%%####%#%%%%%%%(##%%/#(*.
%#((/&%%%&&&&&&&&&&&&&&&&&&&&&&&&&@@@@@@@@@@@@@@@@@@@@@&@@@#&(#((##
#/(., &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&%%%%%&&&&&&&@@@@@@@&&%(##, *
&&&&&&&&&&&&&&&@.,.,,,.,,.,.,,,,,,,,,,,@@@@@@@&*,*,&&&,(%### /(
. .&&&&&&&&&&&&&&@,,......#@@@@@@@@,,,,,,*****/////**,,&&&&%#%/ ,
,,,*/&&&&&&&&&&&&&&&,,...,,,.,,,,,,,,,,**,**/*@@@@@@@@/,,&&&&&&##&&#%
***(&&&&&&&&&,,&&&@*.,%@@@@@@@@@@@,,,,,,,*****@&@@@@@@ *,@&&&&***,**,
/(*/&&&&&&&,,,,,,,,,%@%%&&@@@@@@&&@,,,,,,,*,**@&&@@@@ /*&@@&&&**,,*,*
/%/#&&&&&&&,.,,,,,,,,,, (&&@@@@@@@&,,,,,,,,*/**@@@@@%/**@@@&&&,//*,**
(//%&&&&&&&..,,,,,,,,,,,, ,@&&@@@@(,,,,,,.,*/////////**@@@@&&&&(/*/**
/#/&&&&&&&&&&,,,,,,,,,,,,,,,,,,,,,,,,,,,,,**////////**@@@@@&&&&//((((
#(&&&&&&&&@@@@@@@@@.,,,,,,,,,,,,,,,,,.,,,,,*///////,@@@@@@@@@&&/**/**
(&&&&&&&&&@@@@@@@@@@@@&,,,,,,,,,,,,,,,,,,*//////,#@@@@@@@@@@@&&/.((((
(&&&&&&&&&@@@@@@@@@@@@@@@@@@.,,,,,,,,,,,,,///,@@@@@@@@@@@@@@@@&&#(##(
,&&&&&&&@@@@@@@@@@@@@@@@@@@@@@@@/*//////@@@@@@@@@@@@@@@@@@@@@&&&(%###
%&&&&&&&&@@@@@@@@@@@@@@@@@@@%%,.. (@&..,%@@@@@@@@@@@@@@@@@@@@&&//(&#
#&&&&&&&@@@@@@@@@@@@@@@@@%#%#%%*, .@@@*.,%%%@@@@@@@@@@@@@@@@@@&&&(#%&
%&&&&&&&@@@@@@@@@@@@@&###%%%%%%%...,@&.,/%%%%#%@@@(%%%%#%%%%%&&&&/#(#
%&&&&&&&@@@@@@@@@@@@%#%%%%%%%%%%....@@ %%%%%%#/%%##%%%%%%%%%%/&&%%#
&&&&&&&&@@@@@@@@@@@%%%%%%%%%%%%%....@@@ %%%#%%((#%%%%%%%%%%%%%#(&///
(&&&&&@@@@@@@@@@@@%%%%%%%%%%%%%%.,..@@@..*%#%%((((#%%%%%%%%%%%%%%&##/
&&&&&&&@@@@@@@@@@@###%%%%%%%%%%%(...@@@ ..%%%%%((((#%%%%%%%%%%%%%&(##
&&&&&&@@@@@@@@@@@@%#%%%%%%%%%%%%%,..@@@@..&%%%%%((((%%%%%%%%%%%%&&&%%
&&&&&&@@@@@@@@@@@%%#%##%%%%%%%%%&,..@@@@.,,%%#%%(##((%%%%%%%%(@@&&&&&
&&&&&&&@@@@@@@@@@%%#%#%%%%%%#%%%%,, @@@@(,%%%%/(%###%#&@@@@@@@@&&&&&%
&&&&&&&@@@@@@@@@@%%%##%%%%%%%%%%%.,.@@#&@%&&%%#%%##%%%#@@@&@@@@@&&&&%
&&&&&&&&@@@@@@@@%%%%%%%%%%%%%#%%%./**,*,,%,(%#%%%%#%%##%@@@&@@@@&&&&%
&&&&&&&&@@@@@@@@%%%%%#%%%%%%%##%/**,,,,,,*,,%%##%%%#%%##@@@@@@@&&&&&&