| // Copyright 2015 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package bidi |
| |
| import ( |
| "fmt" |
| "log" |
| ) |
| |
| // This implementation is a port based on the reference implementation found at: |
| // https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/ |
| // |
| // described in Unicode Bidirectional Algorithm (UAX #9). |
| // |
| // Input: |
| // There are two levels of input to the algorithm, since clients may prefer to |
| // supply some information from out-of-band sources rather than relying on the |
| // default behavior. |
| // |
| // - Bidi class array |
| // - Bidi class array, with externally supplied base line direction |
| // |
| // Output: |
| // Output is separated into several stages: |
| // |
| // - levels array over entire paragraph |
| // - reordering array over entire paragraph |
| // - levels array over line |
| // - reordering array over line |
| // |
| // Note that for conformance to the Unicode Bidirectional Algorithm, |
| // implementations are only required to generate correct reordering and |
| // character directionality (odd or even levels) over a line. Generating |
| // identical level arrays over a line is not required. Bidi explicit format |
| // codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and |
| // positions as long as the rest of the input is properly reordered. |
| // |
| // As the algorithm is defined to operate on a single paragraph at a time, this |
| // implementation is written to handle single paragraphs. Thus rule P1 is |
| // presumed by this implementation-- the data provided to the implementation is |
| // assumed to be a single paragraph, and either contains no 'B' codes, or a |
| // single 'B' code at the end of the input. 'B' is allowed as input to |
| // illustrate how the algorithm assigns it a level. |
| // |
| // Also note that rules L3 and L4 depend on the rendering engine that uses the |
| // result of the bidi algorithm. This implementation assumes that the rendering |
| // engine expects combining marks in visual order (e.g. to the left of their |
| // base character in RTL runs) and that it adjusts the glyphs used to render |
| // mirrored characters that are in RTL runs so that they render appropriately. |
| |
| // level is the embedding level of a character. Even embedding levels indicate |
| // left-to-right order and odd levels indicate right-to-left order. The special |
| // level of -1 is reserved for undefined order. |
| type level int8 |
| |
| const implicitLevel level = -1 |
| |
| // in returns if x is equal to any of the values in set. |
| func (c Class) in(set ...Class) bool { |
| for _, s := range set { |
| if c == s { |
| return true |
| } |
| } |
| return false |
| } |
| |
| // A paragraph contains the state of a paragraph. |
| type paragraph struct { |
| initialTypes []Class |
| |
| // Arrays of properties needed for paired bracket evaluation in N0 |
| pairTypes []bracketType // paired Bracket types for paragraph |
| pairValues []rune // rune for opening bracket or pbOpen and pbClose; 0 for pbNone |
| |
| embeddingLevel level // default: = implicitLevel; |
| |
| // at the paragraph levels |
| resultTypes []Class |
| resultLevels []level |
| |
| // Index of matching PDI for isolate initiator characters. For other |
| // characters, the value of matchingPDI will be set to -1. For isolate |
| // initiators with no matching PDI, matchingPDI will be set to the length of |
| // the input string. |
| matchingPDI []int |
| |
| // Index of matching isolate initiator for PDI characters. For other |
| // characters, and for PDIs with no matching isolate initiator, the value of |
| // matchingIsolateInitiator will be set to -1. |
| matchingIsolateInitiator []int |
| } |
| |
| // newParagraph initializes a paragraph. The user needs to supply a few arrays |
| // corresponding to the preprocessed text input. The types correspond to the |
| // Unicode BiDi classes for each rune. pairTypes indicates the bracket type for |
| // each rune. pairValues provides a unique bracket class identifier for each |
| // rune (suggested is the rune of the open bracket for opening and matching |
| // close brackets, after normalization). The embedding levels are optional, but |
| // may be supplied to encode embedding levels of styled text. |
| func newParagraph(types []Class, pairTypes []bracketType, pairValues []rune, levels level) (*paragraph, error) { |
| var err error |
| if err = validateTypes(types); err != nil { |
| return nil, err |
| } |
| if err = validatePbTypes(pairTypes); err != nil { |
| return nil, err |
| } |
| if err = validatePbValues(pairValues, pairTypes); err != nil { |
| return nil, err |
| } |
| if err = validateParagraphEmbeddingLevel(levels); err != nil { |
| return nil, err |
| } |
| |
| p := ¶graph{ |
| initialTypes: append([]Class(nil), types...), |
| embeddingLevel: levels, |
| |
| pairTypes: pairTypes, |
| pairValues: pairValues, |
| |
| resultTypes: append([]Class(nil), types...), |
| } |
| p.run() |
| return p, nil |
| } |
| |
| func (p *paragraph) Len() int { return len(p.initialTypes) } |
| |
| // The algorithm. Does not include line-based processing (Rules L1, L2). |
| // These are applied later in the line-based phase of the algorithm. |
| func (p *paragraph) run() { |
| p.determineMatchingIsolates() |
| |
| // 1) determining the paragraph level |
| // Rule P1 is the requirement for entering this algorithm. |
| // Rules P2, P3. |
| // If no externally supplied paragraph embedding level, use default. |
| if p.embeddingLevel == implicitLevel { |
| p.embeddingLevel = p.determineParagraphEmbeddingLevel(0, p.Len()) |
| } |
| |
| // Initialize result levels to paragraph embedding level. |
| p.resultLevels = make([]level, p.Len()) |
| setLevels(p.resultLevels, p.embeddingLevel) |
| |
| // 2) Explicit levels and directions |
| // Rules X1-X8. |
| p.determineExplicitEmbeddingLevels() |
| |
| // Rule X9. |
| // We do not remove the embeddings, the overrides, the PDFs, and the BNs |
| // from the string explicitly. But they are not copied into isolating run |
| // sequences when they are created, so they are removed for all |
| // practical purposes. |
| |
| // Rule X10. |
| // Run remainder of algorithm one isolating run sequence at a time |
| for _, seq := range p.determineIsolatingRunSequences() { |
| // 3) resolving weak types |
| // Rules W1-W7. |
| seq.resolveWeakTypes() |
| |
| // 4a) resolving paired brackets |
| // Rule N0 |
| resolvePairedBrackets(seq) |
| |
| // 4b) resolving neutral types |
| // Rules N1-N3. |
| seq.resolveNeutralTypes() |
| |
| // 5) resolving implicit embedding levels |
| // Rules I1, I2. |
| seq.resolveImplicitLevels() |
| |
| // Apply the computed levels and types |
| seq.applyLevelsAndTypes() |
| } |
| |
| // Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and |
| // BNs. This is for convenience, so the resulting level array will have |
| // a value for every character. |
| p.assignLevelsToCharactersRemovedByX9() |
| } |
| |
| // determineMatchingIsolates determines the matching PDI for each isolate |
| // initiator and vice versa. |
| // |
| // Definition BD9. |
| // |
| // At the end of this function: |
| // |
| // - The member variable matchingPDI is set to point to the index of the |
| // matching PDI character for each isolate initiator character. If there is |
| // no matching PDI, it is set to the length of the input text. For other |
| // characters, it is set to -1. |
| // - The member variable matchingIsolateInitiator is set to point to the |
| // index of the matching isolate initiator character for each PDI character. |
| // If there is no matching isolate initiator, or the character is not a PDI, |
| // it is set to -1. |
| func (p *paragraph) determineMatchingIsolates() { |
| p.matchingPDI = make([]int, p.Len()) |
| p.matchingIsolateInitiator = make([]int, p.Len()) |
| |
| for i := range p.matchingIsolateInitiator { |
| p.matchingIsolateInitiator[i] = -1 |
| } |
| |
| for i := range p.matchingPDI { |
| p.matchingPDI[i] = -1 |
| |
| if t := p.resultTypes[i]; t.in(LRI, RLI, FSI) { |
| depthCounter := 1 |
| for j := i + 1; j < p.Len(); j++ { |
| if u := p.resultTypes[j]; u.in(LRI, RLI, FSI) { |
| depthCounter++ |
| } else if u == PDI { |
| if depthCounter--; depthCounter == 0 { |
| p.matchingPDI[i] = j |
| p.matchingIsolateInitiator[j] = i |
| break |
| } |
| } |
| } |
| if p.matchingPDI[i] == -1 { |
| p.matchingPDI[i] = p.Len() |
| } |
| } |
| } |
| } |
| |
| // determineParagraphEmbeddingLevel reports the resolved paragraph direction of |
| // the substring limited by the given range [start, end). |
| // |
| // Determines the paragraph level based on rules P2, P3. This is also used |
| // in rule X5c to find if an FSI should resolve to LRI or RLI. |
| func (p *paragraph) determineParagraphEmbeddingLevel(start, end int) level { |
| var strongType Class = unknownClass |
| |
| // Rule P2. |
| for i := start; i < end; i++ { |
| if t := p.resultTypes[i]; t.in(L, AL, R) { |
| strongType = t |
| break |
| } else if t.in(FSI, LRI, RLI) { |
| i = p.matchingPDI[i] // skip over to the matching PDI |
| if i > end { |
| log.Panic("assert (i <= end)") |
| } |
| } |
| } |
| // Rule P3. |
| switch strongType { |
| case unknownClass: // none found |
| // default embedding level when no strong types found is 0. |
| return 0 |
| case L: |
| return 0 |
| default: // AL, R |
| return 1 |
| } |
| } |
| |
| const maxDepth = 125 |
| |
| // This stack will store the embedding levels and override and isolated |
| // statuses |
| type directionalStatusStack struct { |
| stackCounter int |
| embeddingLevelStack [maxDepth + 1]level |
| overrideStatusStack [maxDepth + 1]Class |
| isolateStatusStack [maxDepth + 1]bool |
| } |
| |
| func (s *directionalStatusStack) empty() { s.stackCounter = 0 } |
| func (s *directionalStatusStack) pop() { s.stackCounter-- } |
| func (s *directionalStatusStack) depth() int { return s.stackCounter } |
| |
| func (s *directionalStatusStack) push(level level, overrideStatus Class, isolateStatus bool) { |
| s.embeddingLevelStack[s.stackCounter] = level |
| s.overrideStatusStack[s.stackCounter] = overrideStatus |
| s.isolateStatusStack[s.stackCounter] = isolateStatus |
| s.stackCounter++ |
| } |
| |
| func (s *directionalStatusStack) lastEmbeddingLevel() level { |
| return s.embeddingLevelStack[s.stackCounter-1] |
| } |
| |
| func (s *directionalStatusStack) lastDirectionalOverrideStatus() Class { |
| return s.overrideStatusStack[s.stackCounter-1] |
| } |
| |
| func (s *directionalStatusStack) lastDirectionalIsolateStatus() bool { |
| return s.isolateStatusStack[s.stackCounter-1] |
| } |
| |
| // Determine explicit levels using rules X1 - X8 |
| func (p *paragraph) determineExplicitEmbeddingLevels() { |
| var stack directionalStatusStack |
| var overflowIsolateCount, overflowEmbeddingCount, validIsolateCount int |
| |
| // Rule X1. |
| stack.push(p.embeddingLevel, ON, false) |
| |
| for i, t := range p.resultTypes { |
| // Rules X2, X3, X4, X5, X5a, X5b, X5c |
| switch t { |
| case RLE, LRE, RLO, LRO, RLI, LRI, FSI: |
| isIsolate := t.in(RLI, LRI, FSI) |
| isRTL := t.in(RLE, RLO, RLI) |
| |
| // override if this is an FSI that resolves to RLI |
| if t == FSI { |
| isRTL = (p.determineParagraphEmbeddingLevel(i+1, p.matchingPDI[i]) == 1) |
| } |
| if isIsolate { |
| p.resultLevels[i] = stack.lastEmbeddingLevel() |
| if stack.lastDirectionalOverrideStatus() != ON { |
| p.resultTypes[i] = stack.lastDirectionalOverrideStatus() |
| } |
| } |
| |
| var newLevel level |
| if isRTL { |
| // least greater odd |
| newLevel = (stack.lastEmbeddingLevel() + 1) | 1 |
| } else { |
| // least greater even |
| newLevel = (stack.lastEmbeddingLevel() + 2) &^ 1 |
| } |
| |
| if newLevel <= maxDepth && overflowIsolateCount == 0 && overflowEmbeddingCount == 0 { |
| if isIsolate { |
| validIsolateCount++ |
| } |
| // Push new embedding level, override status, and isolated |
| // status. |
| // No check for valid stack counter, since the level check |
| // suffices. |
| switch t { |
| case LRO: |
| stack.push(newLevel, L, isIsolate) |
| case RLO: |
| stack.push(newLevel, R, isIsolate) |
| default: |
| stack.push(newLevel, ON, isIsolate) |
| } |
| // Not really part of the spec |
| if !isIsolate { |
| p.resultLevels[i] = newLevel |
| } |
| } else { |
| // This is an invalid explicit formatting character, |
| // so apply the "Otherwise" part of rules X2-X5b. |
| if isIsolate { |
| overflowIsolateCount++ |
| } else { // !isIsolate |
| if overflowIsolateCount == 0 { |
| overflowEmbeddingCount++ |
| } |
| } |
| } |
| |
| // Rule X6a |
| case PDI: |
| if overflowIsolateCount > 0 { |
| overflowIsolateCount-- |
| } else if validIsolateCount == 0 { |
| // do nothing |
| } else { |
| overflowEmbeddingCount = 0 |
| for !stack.lastDirectionalIsolateStatus() { |
| stack.pop() |
| } |
| stack.pop() |
| validIsolateCount-- |
| } |
| p.resultLevels[i] = stack.lastEmbeddingLevel() |
| |
| // Rule X7 |
| case PDF: |
| // Not really part of the spec |
| p.resultLevels[i] = stack.lastEmbeddingLevel() |
| |
| if overflowIsolateCount > 0 { |
| // do nothing |
| } else if overflowEmbeddingCount > 0 { |
| overflowEmbeddingCount-- |
| } else if !stack.lastDirectionalIsolateStatus() && stack.depth() >= 2 { |
| stack.pop() |
| } |
| |
| case B: // paragraph separator. |
| // Rule X8. |
| |
| // These values are reset for clarity, in this implementation B |
| // can only occur as the last code in the array. |
| stack.empty() |
| overflowIsolateCount = 0 |
| overflowEmbeddingCount = 0 |
| validIsolateCount = 0 |
| p.resultLevels[i] = p.embeddingLevel |
| |
| default: |
| p.resultLevels[i] = stack.lastEmbeddingLevel() |
| if stack.lastDirectionalOverrideStatus() != ON { |
| p.resultTypes[i] = stack.lastDirectionalOverrideStatus() |
| } |
| } |
| } |
| } |
| |
| type isolatingRunSequence struct { |
| p *paragraph |
| |
| indexes []int // indexes to the original string |
| |
| types []Class // type of each character using the index |
| resolvedLevels []level // resolved levels after application of rules |
| level level |
| sos, eos Class |
| } |
| |
| func (i *isolatingRunSequence) Len() int { return len(i.indexes) } |
| |
| func maxLevel(a, b level) level { |
| if a > b { |
| return a |
| } |
| return b |
| } |
| |
| // Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types, |
| // either L or R, for each isolating run sequence. |
| func (p *paragraph) isolatingRunSequence(indexes []int) *isolatingRunSequence { |
| length := len(indexes) |
| types := make([]Class, length) |
| for i, x := range indexes { |
| types[i] = p.resultTypes[x] |
| } |
| |
| // assign level, sos and eos |
| prevChar := indexes[0] - 1 |
| for prevChar >= 0 && isRemovedByX9(p.initialTypes[prevChar]) { |
| prevChar-- |
| } |
| prevLevel := p.embeddingLevel |
| if prevChar >= 0 { |
| prevLevel = p.resultLevels[prevChar] |
| } |
| |
| var succLevel level |
| lastType := types[length-1] |
| if lastType.in(LRI, RLI, FSI) { |
| succLevel = p.embeddingLevel |
| } else { |
| // the first character after the end of run sequence |
| limit := indexes[length-1] + 1 |
| for ; limit < p.Len() && isRemovedByX9(p.initialTypes[limit]); limit++ { |
| |
| } |
| succLevel = p.embeddingLevel |
| if limit < p.Len() { |
| succLevel = p.resultLevels[limit] |
| } |
| } |
| level := p.resultLevels[indexes[0]] |
| return &isolatingRunSequence{ |
| p: p, |
| indexes: indexes, |
| types: types, |
| level: level, |
| sos: typeForLevel(maxLevel(prevLevel, level)), |
| eos: typeForLevel(maxLevel(succLevel, level)), |
| } |
| } |
| |
| // Resolving weak types Rules W1-W7. |
| // |
| // Note that some weak types (EN, AN) remain after this processing is |
| // complete. |
| func (s *isolatingRunSequence) resolveWeakTypes() { |
| |
| // on entry, only these types remain |
| s.assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI) |
| |
| // Rule W1. |
| // Changes all NSMs. |
| precedingCharacterType := s.sos |
| for i, t := range s.types { |
| if t == NSM { |
| s.types[i] = precedingCharacterType |
| } else { |
| // if t.in(LRI, RLI, FSI, PDI) { |
| // precedingCharacterType = ON |
| // } |
| precedingCharacterType = t |
| } |
| } |
| |
| // Rule W2. |
| // EN does not change at the start of the run, because sos != AL. |
| for i, t := range s.types { |
| if t == EN { |
| for j := i - 1; j >= 0; j-- { |
| if t := s.types[j]; t.in(L, R, AL) { |
| if t == AL { |
| s.types[i] = AN |
| } |
| break |
| } |
| } |
| } |
| } |
| |
| // Rule W3. |
| for i, t := range s.types { |
| if t == AL { |
| s.types[i] = R |
| } |
| } |
| |
| // Rule W4. |
| // Since there must be values on both sides for this rule to have an |
| // effect, the scan skips the first and last value. |
| // |
| // Although the scan proceeds left to right, and changes the type |
| // values in a way that would appear to affect the computations |
| // later in the scan, there is actually no problem. A change in the |
| // current value can only affect the value to its immediate right, |
| // and only affect it if it is ES or CS. But the current value can |
| // only change if the value to its right is not ES or CS. Thus |
| // either the current value will not change, or its change will have |
| // no effect on the remainder of the analysis. |
| |
| for i := 1; i < s.Len()-1; i++ { |
| t := s.types[i] |
| if t == ES || t == CS { |
| prevSepType := s.types[i-1] |
| succSepType := s.types[i+1] |
| if prevSepType == EN && succSepType == EN { |
| s.types[i] = EN |
| } else if s.types[i] == CS && prevSepType == AN && succSepType == AN { |
| s.types[i] = AN |
| } |
| } |
| } |
| |
| // Rule W5. |
| for i, t := range s.types { |
| if t == ET { |
| // locate end of sequence |
| runStart := i |
| runEnd := s.findRunLimit(runStart, ET) |
| |
| // check values at ends of sequence |
| t := s.sos |
| if runStart > 0 { |
| t = s.types[runStart-1] |
| } |
| if t != EN { |
| t = s.eos |
| if runEnd < len(s.types) { |
| t = s.types[runEnd] |
| } |
| } |
| if t == EN { |
| setTypes(s.types[runStart:runEnd], EN) |
| } |
| // continue at end of sequence |
| i = runEnd |
| } |
| } |
| |
| // Rule W6. |
| for i, t := range s.types { |
| if t.in(ES, ET, CS) { |
| s.types[i] = ON |
| } |
| } |
| |
| // Rule W7. |
| for i, t := range s.types { |
| if t == EN { |
| // set default if we reach start of run |
| prevStrongType := s.sos |
| for j := i - 1; j >= 0; j-- { |
| t = s.types[j] |
| if t == L || t == R { // AL's have been changed to R |
| prevStrongType = t |
| break |
| } |
| } |
| if prevStrongType == L { |
| s.types[i] = L |
| } |
| } |
| } |
| } |
| |
| // 6) resolving neutral types Rules N1-N2. |
| func (s *isolatingRunSequence) resolveNeutralTypes() { |
| |
| // on entry, only these types can be in resultTypes |
| s.assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI) |
| |
| for i, t := range s.types { |
| switch t { |
| case WS, ON, B, S, RLI, LRI, FSI, PDI: |
| // find bounds of run of neutrals |
| runStart := i |
| runEnd := s.findRunLimit(runStart, B, S, WS, ON, RLI, LRI, FSI, PDI) |
| |
| // determine effective types at ends of run |
| var leadType, trailType Class |
| |
| // Note that the character found can only be L, R, AN, or |
| // EN. |
| if runStart == 0 { |
| leadType = s.sos |
| } else { |
| leadType = s.types[runStart-1] |
| if leadType.in(AN, EN) { |
| leadType = R |
| } |
| } |
| if runEnd == len(s.types) { |
| trailType = s.eos |
| } else { |
| trailType = s.types[runEnd] |
| if trailType.in(AN, EN) { |
| trailType = R |
| } |
| } |
| |
| var resolvedType Class |
| if leadType == trailType { |
| // Rule N1. |
| resolvedType = leadType |
| } else { |
| // Rule N2. |
| // Notice the embedding level of the run is used, not |
| // the paragraph embedding level. |
| resolvedType = typeForLevel(s.level) |
| } |
| |
| setTypes(s.types[runStart:runEnd], resolvedType) |
| |
| // skip over run of (former) neutrals |
| i = runEnd |
| } |
| } |
| } |
| |
| func setLevels(levels []level, newLevel level) { |
| for i := range levels { |
| levels[i] = newLevel |
| } |
| } |
| |
| func setTypes(types []Class, newType Class) { |
| for i := range types { |
| types[i] = newType |
| } |
| } |
| |
| // 7) resolving implicit embedding levels Rules I1, I2. |
| func (s *isolatingRunSequence) resolveImplicitLevels() { |
| |
| // on entry, only these types can be in resultTypes |
| s.assertOnly(L, R, EN, AN) |
| |
| s.resolvedLevels = make([]level, len(s.types)) |
| setLevels(s.resolvedLevels, s.level) |
| |
| if (s.level & 1) == 0 { // even level |
| for i, t := range s.types { |
| // Rule I1. |
| if t == L { |
| // no change |
| } else if t == R { |
| s.resolvedLevels[i] += 1 |
| } else { // t == AN || t == EN |
| s.resolvedLevels[i] += 2 |
| } |
| } |
| } else { // odd level |
| for i, t := range s.types { |
| // Rule I2. |
| if t == R { |
| // no change |
| } else { // t == L || t == AN || t == EN |
| s.resolvedLevels[i] += 1 |
| } |
| } |
| } |
| } |
| |
| // Applies the levels and types resolved in rules W1-I2 to the |
| // resultLevels array. |
| func (s *isolatingRunSequence) applyLevelsAndTypes() { |
| for i, x := range s.indexes { |
| s.p.resultTypes[x] = s.types[i] |
| s.p.resultLevels[x] = s.resolvedLevels[i] |
| } |
| } |
| |
| // Return the limit of the run consisting only of the types in validSet |
| // starting at index. This checks the value at index, and will return |
| // index if that value is not in validSet. |
| func (s *isolatingRunSequence) findRunLimit(index int, validSet ...Class) int { |
| loop: |
| for ; index < len(s.types); index++ { |
| t := s.types[index] |
| for _, valid := range validSet { |
| if t == valid { |
| continue loop |
| } |
| } |
| return index // didn't find a match in validSet |
| } |
| return len(s.types) |
| } |
| |
| // Algorithm validation. Assert that all values in types are in the |
| // provided set. |
| func (s *isolatingRunSequence) assertOnly(codes ...Class) { |
| loop: |
| for i, t := range s.types { |
| for _, c := range codes { |
| if t == c { |
| continue loop |
| } |
| } |
| log.Panicf("invalid bidi code %v present in assertOnly at position %d", t, s.indexes[i]) |
| } |
| } |
| |
| // determineLevelRuns returns an array of level runs. Each level run is |
| // described as an array of indexes into the input string. |
| // |
| // Determines the level runs. Rule X9 will be applied in determining the |
| // runs, in the way that makes sure the characters that are supposed to be |
| // removed are not included in the runs. |
| func (p *paragraph) determineLevelRuns() [][]int { |
| run := []int{} |
| allRuns := [][]int{} |
| currentLevel := implicitLevel |
| |
| for i := range p.initialTypes { |
| if !isRemovedByX9(p.initialTypes[i]) { |
| if p.resultLevels[i] != currentLevel { |
| // we just encountered a new run; wrap up last run |
| if currentLevel >= 0 { // only wrap it up if there was a run |
| allRuns = append(allRuns, run) |
| run = nil |
| } |
| // Start new run |
| currentLevel = p.resultLevels[i] |
| } |
| run = append(run, i) |
| } |
| } |
| // Wrap up the final run, if any |
| if len(run) > 0 { |
| allRuns = append(allRuns, run) |
| } |
| return allRuns |
| } |
| |
| // Definition BD13. Determine isolating run sequences. |
| func (p *paragraph) determineIsolatingRunSequences() []*isolatingRunSequence { |
| levelRuns := p.determineLevelRuns() |
| |
| // Compute the run that each character belongs to |
| runForCharacter := make([]int, p.Len()) |
| for i, run := range levelRuns { |
| for _, index := range run { |
| runForCharacter[index] = i |
| } |
| } |
| |
| sequences := []*isolatingRunSequence{} |
| |
| var currentRunSequence []int |
| |
| for _, run := range levelRuns { |
| first := run[0] |
| if p.initialTypes[first] != PDI || p.matchingIsolateInitiator[first] == -1 { |
| currentRunSequence = nil |
| // int run = i; |
| for { |
| // Copy this level run into currentRunSequence |
| currentRunSequence = append(currentRunSequence, run...) |
| |
| last := currentRunSequence[len(currentRunSequence)-1] |
| lastT := p.initialTypes[last] |
| if lastT.in(LRI, RLI, FSI) && p.matchingPDI[last] != p.Len() { |
| run = levelRuns[runForCharacter[p.matchingPDI[last]]] |
| } else { |
| break |
| } |
| } |
| sequences = append(sequences, p.isolatingRunSequence(currentRunSequence)) |
| } |
| } |
| return sequences |
| } |
| |
| // Assign level information to characters removed by rule X9. This is for |
| // ease of relating the level information to the original input data. Note |
| // that the levels assigned to these codes are arbitrary, they're chosen so |
| // as to avoid breaking level runs. |
| func (p *paragraph) assignLevelsToCharactersRemovedByX9() { |
| for i, t := range p.initialTypes { |
| if t.in(LRE, RLE, LRO, RLO, PDF, BN) { |
| p.resultTypes[i] = t |
| p.resultLevels[i] = -1 |
| } |
| } |
| // now propagate forward the levels information (could have |
| // propagated backward, the main thing is not to introduce a level |
| // break where one doesn't already exist). |
| |
| if p.resultLevels[0] == -1 { |
| p.resultLevels[0] = p.embeddingLevel |
| } |
| for i := 1; i < len(p.initialTypes); i++ { |
| if p.resultLevels[i] == -1 { |
| p.resultLevels[i] = p.resultLevels[i-1] |
| } |
| } |
| // Embedding information is for informational purposes only so need not be |
| // adjusted. |
| } |
| |
| // |
| // Output |
| // |
| |
| // getLevels computes levels array breaking lines at offsets in linebreaks. |
| // Rule L1. |
| // |
| // The linebreaks array must include at least one value. The values must be |
| // in strictly increasing order (no duplicates) between 1 and the length of |
| // the text, inclusive. The last value must be the length of the text. |
| func (p *paragraph) getLevels(linebreaks []int) []level { |
| // Note that since the previous processing has removed all |
| // P, S, and WS values from resultTypes, the values referred to |
| // in these rules are the initial types, before any processing |
| // has been applied (including processing of overrides). |
| // |
| // This example implementation has reinserted explicit format codes |
| // and BN, in order that the levels array correspond to the |
| // initial text. Their final placement is not normative. |
| // These codes are treated like WS in this implementation, |
| // so they don't interrupt sequences of WS. |
| |
| validateLineBreaks(linebreaks, p.Len()) |
| |
| result := append([]level(nil), p.resultLevels...) |
| |
| // don't worry about linebreaks since if there is a break within |
| // a series of WS values preceding S, the linebreak itself |
| // causes the reset. |
| for i, t := range p.initialTypes { |
| if t.in(B, S) { |
| // Rule L1, clauses one and two. |
| result[i] = p.embeddingLevel |
| |
| // Rule L1, clause three. |
| for j := i - 1; j >= 0; j-- { |
| if isWhitespace(p.initialTypes[j]) { // including format codes |
| result[j] = p.embeddingLevel |
| } else { |
| break |
| } |
| } |
| } |
| } |
| |
| // Rule L1, clause four. |
| start := 0 |
| for _, limit := range linebreaks { |
| for j := limit - 1; j >= start; j-- { |
| if isWhitespace(p.initialTypes[j]) { // including format codes |
| result[j] = p.embeddingLevel |
| } else { |
| break |
| } |
| } |
| start = limit |
| } |
| |
| return result |
| } |
| |
| // getReordering returns the reordering of lines from a visual index to a |
| // logical index for line breaks at the given offsets. |
| // |
| // Lines are concatenated from left to right. So for example, the fifth |
| // character from the left on the third line is |
| // |
| // getReordering(linebreaks)[linebreaks[1] + 4] |
| // |
| // (linebreaks[1] is the position after the last character of the second |
| // line, which is also the index of the first character on the third line, |
| // and adding four gets the fifth character from the left). |
| // |
| // The linebreaks array must include at least one value. The values must be |
| // in strictly increasing order (no duplicates) between 1 and the length of |
| // the text, inclusive. The last value must be the length of the text. |
| func (p *paragraph) getReordering(linebreaks []int) []int { |
| validateLineBreaks(linebreaks, p.Len()) |
| |
| return computeMultilineReordering(p.getLevels(linebreaks), linebreaks) |
| } |
| |
| // Return multiline reordering array for a given level array. Reordering |
| // does not occur across a line break. |
| func computeMultilineReordering(levels []level, linebreaks []int) []int { |
| result := make([]int, len(levels)) |
| |
| start := 0 |
| for _, limit := range linebreaks { |
| tempLevels := make([]level, limit-start) |
| copy(tempLevels, levels[start:]) |
| |
| for j, order := range computeReordering(tempLevels) { |
| result[start+j] = order + start |
| } |
| start = limit |
| } |
| return result |
| } |
| |
| // Return reordering array for a given level array. This reorders a single |
| // line. The reordering is a visual to logical map. For example, the |
| // leftmost char is string.charAt(order[0]). Rule L2. |
| func computeReordering(levels []level) []int { |
| result := make([]int, len(levels)) |
| // initialize order |
| for i := range result { |
| result[i] = i |
| } |
| |
| // locate highest level found on line. |
| // Note the rules say text, but no reordering across line bounds is |
| // performed, so this is sufficient. |
| highestLevel := level(0) |
| lowestOddLevel := level(maxDepth + 2) |
| for _, level := range levels { |
| if level > highestLevel { |
| highestLevel = level |
| } |
| if level&1 != 0 && level < lowestOddLevel { |
| lowestOddLevel = level |
| } |
| } |
| |
| for level := highestLevel; level >= lowestOddLevel; level-- { |
| for i := 0; i < len(levels); i++ { |
| if levels[i] >= level { |
| // find range of text at or above this level |
| start := i |
| limit := i + 1 |
| for limit < len(levels) && levels[limit] >= level { |
| limit++ |
| } |
| |
| for j, k := start, limit-1; j < k; j, k = j+1, k-1 { |
| result[j], result[k] = result[k], result[j] |
| } |
| // skip to end of level run |
| i = limit |
| } |
| } |
| } |
| |
| return result |
| } |
| |
| // isWhitespace reports whether the type is considered a whitespace type for the |
| // line break rules. |
| func isWhitespace(c Class) bool { |
| switch c { |
| case LRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS: |
| return true |
| } |
| return false |
| } |
| |
| // isRemovedByX9 reports whether the type is one of the types removed in X9. |
| func isRemovedByX9(c Class) bool { |
| switch c { |
| case LRE, RLE, LRO, RLO, PDF, BN: |
| return true |
| } |
| return false |
| } |
| |
| // typeForLevel reports the strong type (L or R) corresponding to the level. |
| func typeForLevel(level level) Class { |
| if (level & 0x1) == 0 { |
| return L |
| } |
| return R |
| } |
| |
| func validateTypes(types []Class) error { |
| if len(types) == 0 { |
| return fmt.Errorf("types is null") |
| } |
| for i, t := range types[:len(types)-1] { |
| if t == B { |
| return fmt.Errorf("B type before end of paragraph at index: %d", i) |
| } |
| } |
| return nil |
| } |
| |
| func validateParagraphEmbeddingLevel(embeddingLevel level) error { |
| if embeddingLevel != implicitLevel && |
| embeddingLevel != 0 && |
| embeddingLevel != 1 { |
| return fmt.Errorf("illegal paragraph embedding level: %d", embeddingLevel) |
| } |
| return nil |
| } |
| |
| func validateLineBreaks(linebreaks []int, textLength int) error { |
| prev := 0 |
| for i, next := range linebreaks { |
| if next <= prev { |
| return fmt.Errorf("bad linebreak: %d at index: %d", next, i) |
| } |
| prev = next |
| } |
| if prev != textLength { |
| return fmt.Errorf("last linebreak was %d, want %d", prev, textLength) |
| } |
| return nil |
| } |
| |
| func validatePbTypes(pairTypes []bracketType) error { |
| if len(pairTypes) == 0 { |
| return fmt.Errorf("pairTypes is null") |
| } |
| for i, pt := range pairTypes { |
| switch pt { |
| case bpNone, bpOpen, bpClose: |
| default: |
| return fmt.Errorf("illegal pairType value at %d: %v", i, pairTypes[i]) |
| } |
| } |
| return nil |
| } |
| |
| func validatePbValues(pairValues []rune, pairTypes []bracketType) error { |
| if pairValues == nil { |
| return fmt.Errorf("pairValues is null") |
| } |
| if len(pairTypes) != len(pairValues) { |
| return fmt.Errorf("pairTypes is different length from pairValues") |
| } |
| return nil |
| } |