Following on from my previous TTS post using Chatterbox Turbo (or any other TTS like Kokoro or Qwen3-TTS), here is code to combine audio files into a single .m4b audiobook, retaining author and title metadata, chapter markers, and a cover image. Mostly this is a Python wrapper around FFmpeg.
Again, disclaimers: don’t download and run. Code may not be safe.
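The prerequisites are an up-to-date version of Python (3.10 or later, since the script uses `match`), `ffmpeg` along with `ffprobe`, and the `epub2text` Python package that the script imports. One way to get Python and FFmpeg is via Homebrew: `brew install python ffmpeg`.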
- title and author information is read from the `.epub`, but `ffprobe` is used to get the durations of each of the individual audio files, in order to create chapter breaks.
- `ffmpeg` combines the individual audio files into a `.m4b`, converting to `aac` in the process.
- the Python `zipfile` library extracts the first image from the `.epub`, and then `ffmpeg` is used again to add this image, assumed to be the book cover, to the metadata.
# audiobook.py - Make .m4b audiobook from individual .m4a/.mp3 TTS files
# (c) C.Y., mybyways.com
# v0.1 15 Feb 26
import os, sys, time, glob, zipfile, subprocess
from epub2text import EPUBParser
# Scratch files written into the audio directory for ffmpeg to consume.
METADATA_FILE = 'metadata.txt'  # FFMETADATA1 file: title, artist and chapter markers
CHAPTERS_FILE = 'chapters.txt'  # concat-demuxer list of the audio files to join

# Prints only the container duration, in seconds, of the file appended to the
# command. The log level is set once: the former trailing "-loglevel quiet"
# overrode "-v error" and hid the reason whenever ffprobe failed.
FFPROBE_CMD = 'ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1'

# Input 0 is the concat list, input 1 the metadata file. ffmpeg applies each
# option to the next file named on the command line, so the encoder options
# (-c:a aac -vn) must come after the last -i to act on the output file.
FFMPEG_CONCAT_CMD = 'ffmpeg -y -f concat -safe 0 -i "{chapters_file}" -i "{metadata_file}" -map_metadata 1 -c:a aac -vn "{temp_file}"'

# Stream-copies the merged audio and attaches the image as cover art.
FFMPEG_COVER_CMD = 'ffmpeg -i "{temp_file}" -i "{cover_file}" -c copy -disposition:v attached_pic "{audiobook_file}"'

# Extension of the per-chapter audio files, which are named 001, 002, ... plus this suffix.
INPUT_FORMAT = '.flac'
#https://stackoverflow.com/questions/6405208/how-to-convert-numeric-string-ranges-to-a-list-in-python
def stringrange_to_list(text):
    """Expand a spec such as "1,4-6" into the list [1, 4, 5, 6].

    Comma-separated items are either single integers or dash-separated
    ranges whose upper bound is inclusive.
    """
    numbers = []
    for token in text.split(','):
        if '-' not in token:
            numbers.append(int(token))
            continue
        # Adding the position (0, 1, ...) to each bound turns "4-6" into
        # range(4, 7), which makes the upper bound inclusive.
        bounds = [int(part) + offset for offset, part in enumerate(token.split('-'))]
        numbers.extend(range(*bounds))
    return numbers
def start_timer():
    """Return a time.perf_counter() reading to pass to stop_timer() later."""
    started_at = time.perf_counter()
    return started_at
def stop_timer(tic):
    """Return the time elapsed since `tic`, a value from start_timer().

    The result is a tuple (elapsed_seconds, whole_minutes, remaining_seconds):
    the first item is a float, the other two are ints.
    """
    elapsed = time.perf_counter() - tic
    whole_minutes, leftover_seconds = divmod(elapsed, 60)
    return elapsed, int(whole_minutes), int(leftover_seconds)
def list_epub_chapters(epub_file):
    """Print the book's title and author(s), then a numbered chapter list.

    Each chapter line shows its 1-based position, its title and its
    character count.
    """
    book = EPUBParser(epub_file)
    info = book.get_metadata()
    authors = ", ".join(info.authors)
    print(f'Book: {epub_file}\nTitle: {info.title}\nAuthor(s): {authors}')
    for number, chapter in enumerate(book.get_chapters(), start=1):
        print(f' {number:>3}. {chapter.title}: {chapter.char_count:,} characters')
def _escape_ffmetadata(value):
    """Backslash-escape the characters that are special in an FFMETADATA file."""
    text = str(value)
    # The backslash goes first so the escapes added below are not escaped again.
    for special in ('\\', '=', ';', '#', '\n'):
        text = text.replace(special, '\\' + special)
    return text


def save_metadata(epub_file, audio_dir, chapter_range):
    """Write the ffmpeg metadata file and concat list for the chosen chapters.

    Args:
        epub_file: path of the .epub supplying the title, authors and chapter titles.
        audio_dir: directory holding the per-chapter audio files, named by
            zero-padded chapter number plus INPUT_FORMAT (e.g. 001.flac).
        chapter_range: sequence of 1-based chapter numbers to include.

    Returns:
        Tuple (metadata_file, chapters_file) with the paths of the two files
        written inside audio_dir.

    Raises:
        ValueError: if ffprobe does not report a usable duration for a file.

    Chapters whose audio file is missing are skipped. Chapter titles are
    prefixed with their position in chapter_range, so a skipped chapter
    leaves a gap in the numbering.
    """
    import shlex

    epub = EPUBParser(epub_file)
    metadata = epub.get_metadata()
    chapters = epub.get_chapters()
    metadata_file = os.path.join(audio_dir, METADATA_FILE)
    chapters_file = os.path.join(audio_dir, CHAPTERS_FILE)
    # An argument list (no shell) keeps paths with quotes or "$" intact.
    probe_cmd = shlex.split(FFPROBE_CMD)
    # ffmpeg expects UTF-8 metadata; do not depend on the platform default encoding.
    with open(metadata_file, 'w', encoding='utf-8') as meta, \
            open(chapters_file, 'w', encoding='utf-8') as chap:
        meta.write(';FFMETADATA1\n'
                   f'title={_escape_ffmetadata(metadata.title)}\n'
                   f'artist={_escape_ffmetadata(", ".join(metadata.authors))}\n')
        start = 0  # running offset in milliseconds (TIMEBASE=1/1000)
        for i, c in enumerate(chapter_range):
            audio_name = f'{c:03}{INPUT_FORMAT}'
            audio_file = os.path.join(audio_dir, audio_name)
            if not os.path.isfile(audio_file):
                continue
            # The concat list sits in audio_dir, so a bare file name is enough.
            chap.write(f"file '{audio_name}'\n")
            result = subprocess.run(probe_cmd + [audio_file], stdout=subprocess.PIPE)
            try:
                duration = float(result.stdout)
            except ValueError as err:
                raise ValueError(
                    f'ffprobe could not determine the duration of {audio_file}'
                ) from err
            end = start + int(duration * 1000)
            chapter_title = _escape_ffmetadata(f'{i+1}. {chapters[c-1].title}')
            meta.write('[CHAPTER]\n'
                       'TIMEBASE=1/1000\n'
                       f'START={start}\n'
                       f'END={end}\n'
                       f'title={chapter_title}\n')
            start = end
    return metadata_file, chapters_file
def merge_audiobook(audio_dir, metadata_file, chapters_file):
    """Concatenate the listed audio files into audio_dir/output.m4b.

    Args:
        audio_dir: directory that receives the merged file.
        metadata_file: FFMETADATA file with title, artist and chapters.
        chapters_file: concat-demuxer list of the audio files to join.

    Returns:
        Path of the merged .m4b if ffmpeg exits with status 0, otherwise None.
    """
    import shlex

    temp_file = os.path.join(audio_dir, 'output.m4b')
    paths = {
        'metadata_file': metadata_file,
        'chapters_file': chapters_file,
        'temp_file': temp_file,
    }
    # Split the template before substituting the paths, then run without a
    # shell, so a path containing quotes, "$" or backticks reaches ffmpeg
    # unchanged. The template must therefore not rely on shell features.
    command = [arg.format(**paths) for arg in shlex.split(FFMPEG_CONCAT_CMD)]
    result = subprocess.run(command, stdout=subprocess.PIPE)
    if result.returncode != 0:
        return None
    return temp_file
# Extract the first jpg/png image found in the .epub and attach it as cover art
# to the final .m4b, which is written to the current working directory.
def save_cover(epub_file, audio_dir, temp_file):
    """Produce the final audiobook from the merged file, adding a cover if any.

    Args:
        epub_file: path of the .epub; its first .jpg/.jpeg/.png entry is
            assumed to be the cover, and its base name names the output.
        audio_dir: directory the cover image is extracted into.
        temp_file: merged .m4b produced by merge_audiobook().

    Returns:
        Name of the final .m4b (relative to the current directory), or None
        if ffmpeg failed to attach the cover.

    If the .epub contains no image, the merged file is copied to the final
    name unchanged so an audiobook is still produced.
    """
    import shlex
    import shutil

    audiobook_file = f'{os.path.splitext(os.path.basename(epub_file))[0]}.m4b'
    with zipfile.ZipFile(epub_file, mode='r') as archive:
        cover = next(
            (info for info in archive.infolist()
             if info.filename.lower().endswith(('.jpg', '.jpeg', '.png'))),
            None,
        )
        cover_file = archive.extract(cover, audio_dir) if cover is not None else None
    if cover_file is None:
        shutil.copyfile(temp_file, audiobook_file)
        return audiobook_file
    paths = {
        'cover_file': cover_file,
        'temp_file': temp_file,
        'audiobook_file': audiobook_file,
    }
    # Split the template before substituting the paths, then run without a
    # shell, so unusual characters in a book's file name cannot break the command.
    command = [arg.format(**paths) for arg in shlex.split(FFMPEG_COVER_CMD)]
    result = subprocess.run(command, stdout=subprocess.PIPE)
    return audiobook_file if result.returncode == 0 else None
match len(sys.argv):
case 2:
epub_file = sys.argv[1]
if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):
list_epub_chapters(epub_file)
case 3:
epub_file = sys.argv[1]
chapter_range = sys.argv[2]
if os.path.isfile(epub_file) and epub_file.lower().endswith('.epub'):
audio_dir = os.path.splitext(os.path.basename(epub_file))[0]
if os.path.isdir(audio_dir):
chapter_range = stringrange_to_list(chapter_range)
tic = start_timer()
metadata_file, chapters_file = save_metadata(epub_file, audio_dir, chapter_range)
temp_file = merge_audiobook(audio_dir, metadata_file, chapters_file)
if temp_file:
audiobook_file = save_cover(epub_file, audio_dir, temp_file)
tic, min, sec = stop_timer(tic)
print(f'Completed to {audiobook_file} in {min:,} minutes {sec} seconds')
case _:
cmd = sys.argv[0]
print(f'''{cmd} epub_file - list .epub chapters (file must exist)
{cmd} epub_file chapter_range - merge audio files into single .m4b for given chapters''')
Notes:
- Cover image extraction is very dumb - it just assumes the first image is the cover...
- Edit the global variables as needed - these may not be the optimal `ffmpeg` or `ffprobe` commands but they work for me.
- Uses a temporary directory named after the `.epub` filename (the one holding the audio files), but saves the final `.m4b` alongside the `.epub`. Assuming all goes well, feel free to delete the temporary directory once the audiobook is generated.
- I updated my previous Chatterbox Turbo TTS code to generate `.flac`, but the audiobook will be in `aac` format.
Usage is similar to previous code:
- running `python audiobook.py book.epub` will list the chapters of an `.epub` book.
- running `python audiobook.py book.epub 1,4-6` will combine chapters 1, 4, 5 and 6 into an audiobook.