1 /* 2 * Copyright 2006-2007, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 #include "EndpointManager.h" 14 15 #include <net_buffer.h> 16 #include <net_datalink.h> 17 #include <net_stat.h> 18 #include <NetBufferUtilities.h> 19 #include <NetUtilities.h> 20 21 #include <lock.h> 22 #include <util/AutoLock.h> 23 #include <util/khash.h> 24 #include <util/list.h> 25 26 #include <KernelExport.h> 27 28 #include <netinet/in.h> 29 #include <netinet/ip.h> 30 #include <netinet/tcp.h> 31 #include <new> 32 #include <stdlib.h> 33 #include <string.h> 34 35 36 // References: 37 // - RFC 793 - Transmission Control Protocol 38 // - RFC 813 - Window and Acknowledgement Strategy in TCP 39 // 40 // Things this implementation currently doesn't implement: 41 // 42 // TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, RFC 2001, RFC 2581, RFC 3042 43 // NewReno Modification to TCP's Fast Recovery, RFC 2582 44 // Explicit Congestion Notification (ECN), RFC 3168 45 // SYN-Cache 46 // TCP Extensions for High Performance, RFC 1323 47 // SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 48 // Forward RTO-Recovery, RFC 4138 49 50 #define PrintAddress(address) \ 51 AddressString(Domain(), (const sockaddr *)(address), true).Data() 52 53 //#define TRACE_TCP 54 //#define PROBE_TCP 55 56 #ifdef TRACE_TCP 57 // the space before ', ##args' is important in order for this to work with cpp 2.95 58 # define TRACE(format, args...) dprintf("TCP [%llu] %p (%12s) " format "\n", \ 59 system_time(), this, name_for_state(fState) , ##args) 60 #else 61 # define TRACE(args...) do { } while (0) 62 #endif 63 64 #ifdef PROBE_TCP 65 # define PROBE(buffer, window) \ 66 dprintf("TCP PROBE %llu %s %s %ld %lu %lu %lu %lu %lu %lu %lu %lu %lu %llu\n", \ 67 system_time(), PrintAddress(&buffer->source), \ 68 PrintAddress(&buffer->destination), buffer->size, (uint32)fSendNext, \ 69 (uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \ 70 window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \ 71 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 72 #else 73 # define PROBE(buffer, window) do { } while (0) 74 #endif 75 76 // Initial estimate for packet round trip time (RTT) 77 #define TCP_INITIAL_RTT 2000000 78 79 // constants for the fFlags field 80 enum { 81 FLAG_OPTION_WINDOW_SCALE = 0x01, 82 FLAG_OPTION_TIMESTAMP = 0x02, 83 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 84 // That is, what is expected from accept() after a shutdown() 85 // is performed on a listen()ing socket. 86 FLAG_NO_RECEIVE = 0x04, 87 }; 88 89 90 static const int kTimestampFactor = 1024; 91 92 93 static inline bigtime_t 94 absolute_timeout(bigtime_t timeout) 95 { 96 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 97 return timeout; 98 99 return timeout + system_time(); 100 } 101 102 103 static inline status_t 104 posix_error(status_t error) 105 { 106 if (error == B_TIMED_OUT) 107 return B_WOULD_BLOCK; 108 109 return error; 110 } 111 112 113 static inline bool 114 in_window(const tcp_sequence &sequence, const tcp_sequence &rcvNext, 115 uint32 rcvWindow) 116 { 117 return sequence >= rcvNext && sequence < (rcvNext + rcvWindow); 118 } 119 120 121 static inline bool 122 segment_in_sequence(const tcp_segment_header &segment, int size, 123 const tcp_sequence &rcvNext, uint32 rcvWindow) 124 { 125 tcp_sequence sequence(segment.sequence); 126 if (size == 0) { 127 if (rcvWindow == 0) 128 return sequence == rcvNext; 129 return in_window(sequence, rcvNext, rcvWindow); 130 } else { 131 if (rcvWindow == 0) 132 return false; 133 return in_window(sequence, rcvNext, rcvWindow) 134 || in_window(sequence + size - 1, rcvNext, rcvWindow); 135 } 136 } 137 138 139 static inline bool 140 is_writable(tcp_state state) 141 { 142 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED 143 || state == ESTABLISHED || state == FINISH_RECEIVED; 144 } 145 146 147 static inline uint32 tcp_now() 148 { 149 return system_time() / kTimestampFactor; 150 } 151 152 153 static inline uint32 tcp_diff_timestamp(uint32 base) 154 { 155 uint32 now = tcp_now(); 156 157 if (now > base) 158 return now - base; 159 160 return now + UINT_MAX - base; 161 } 162 163 164 static inline bool 165 state_needs_finish(int32 state) 166 { 167 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 168 || state == FINISH_SENT || state == CLOSING; 169 } 170 171 172 WaitList::WaitList(const char *name) 173 { 174 fCondition = 0; 175 fSem = create_sem(0, name); 176 } 177 178 179 WaitList::~WaitList() 180 { 181 delete_sem(fSem); 182 } 183 184 185 status_t 186 WaitList::InitCheck() const 187 { 188 return fSem; 189 } 190 191 192 status_t 193 WaitList::Wait(RecursiveLocker &locker, bigtime_t timeout, bool wakeNext) 194 { 195 locker.Unlock(); 196 197 status_t status = B_OK; 198 199 while (status == B_OK && !atomic_test_and_set(&fCondition, 0, 1)) 200 status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, 201 timeout); 202 203 locker.Lock(); 204 if (status == B_OK && wakeNext) 205 Signal(); 206 207 return status; 208 } 209 210 211 void 212 WaitList::Signal() 213 { 214 atomic_or(&fCondition, 1); 215 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_IF_WAITING_ONLY); 216 } 217 218 219 TCPEndpoint::TCPEndpoint(net_socket *socket) 220 : 221 ProtocolSocket(socket), 222 fManager(NULL), 223 fReceiveList("tcp receive"), 224 fSendList("tcp send"), 225 fOptions(0), 226 fSendWindowShift(0), 227 fReceiveWindowShift(0), 228 fSendUnacknowledged(0), 229 fSendNext(0), 230 fSendMax(0), 231 fSendWindow(0), 232 fSendMaxWindow(0), 233 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 234 fSendQueue(socket->send.buffer_size), 235 fInitialSendSequence(0), 236 fDuplicateAcknowledgeCount(0), 237 fRoute(NULL), 238 fReceiveNext(0), 239 fReceiveMaxAdvertised(0), 240 fReceiveWindow(socket->receive.buffer_size), 241 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 242 fReceiveQueue(socket->receive.buffer_size), 243 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 244 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 245 fRetransmitTimeout(TCP_INITIAL_RTT), 246 fReceivedTimestamp(0), 247 fCongestionWindow(0), 248 fSlowStartThreshold(0), 249 fState(CLOSED), 250 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP), 251 fError(B_OK) 252 { 253 //gStackModule->init_timer(&fTimer, _TimeWait, this); 254 255 recursive_lock_init(&fLock, "tcp lock"); 256 // TODO: to be replaced with a real locking strategy! 257 258 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 259 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, this); 260 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 261 TCPEndpoint::_DelayedAcknowledgeTimer, this); 262 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, this); 263 } 264 265 266 TCPEndpoint::~TCPEndpoint() 267 { 268 recursive_lock_lock(&fLock); 269 270 gStackModule->cancel_timer(&fRetransmitTimer); 271 gStackModule->cancel_timer(&fPersistTimer); 272 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 273 gStackModule->cancel_timer(&fTimeWaitTimer); 274 275 if (fManager) { 276 fManager->Unbind(this); 277 return_endpoint_manager(fManager); 278 } 279 280 recursive_lock_destroy(&fLock); 281 } 282 283 284 status_t 285 TCPEndpoint::InitCheck() const 286 { 287 if (fLock.sem < B_OK) 288 return fLock.sem; 289 290 if (fReceiveList.InitCheck() < B_OK) 291 return fReceiveList.InitCheck(); 292 293 if (fSendList.InitCheck() < B_OK) 294 return fSendList.InitCheck(); 295 296 return B_OK; 297 } 298 299 300 // #pragma mark - protocol API 301 302 303 status_t 304 TCPEndpoint::Open() 305 { 306 TRACE("Open()"); 307 308 status_t status = ProtocolSocket::Open(); 309 if (status < B_OK) 310 return status; 311 312 fManager = create_endpoint_manager(Domain()); 313 if (fManager == NULL) 314 return EAFNOSUPPORT; 315 316 return B_OK; 317 } 318 319 320 status_t 321 TCPEndpoint::Close() 322 { 323 TRACE("Close()"); 324 325 RecursiveLocker lock(fLock); 326 327 if (fState == LISTEN) 328 delete_sem(fAcceptSemaphore); 329 330 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 331 fState = CLOSED; 332 return B_OK; 333 } 334 335 status_t status = _ShutdownEgress(true); 336 if (status != B_OK) 337 return status; 338 339 if (socket->options & SO_LINGER) { 340 TRACE("Close(): Lingering for %i secs", socket->linger); 341 342 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 343 344 while (fSendQueue.Used() > 0) { 345 status = fSendList.Wait(lock, maximum); 346 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 347 break; 348 else if (status < B_OK) 349 return status; 350 } 351 352 TRACE("Close(): after waiting, the SendQ was left with %lu bytes.", 353 fSendQueue.Used()); 354 } 355 356 // TODO: do i need to wait until fState returns to CLOSED? 357 return B_OK; 358 } 359 360 361 status_t 362 TCPEndpoint::Free() 363 { 364 TRACE("Free()"); 365 366 RecursiveLocker _(fLock); 367 368 if (fState <= SYNCHRONIZE_SENT || fState == TIME_WAIT) 369 return B_OK; 370 371 // we are only interested in the timer, not in changing state 372 _EnterTimeWait(); 373 return B_BUSY; 374 // we'll be freed later when the 2MSL timer expires 375 } 376 377 378 /*! 379 Creates and sends a synchronize packet to /a address, and then waits 380 until the connection has been established or refused. 381 */ 382 status_t 383 TCPEndpoint::Connect(const sockaddr *address) 384 { 385 TRACE("Connect() on address %s", PrintAddress(address)); 386 387 RecursiveLocker locker(fLock); 388 389 // Can only call connect() from CLOSED or LISTEN states 390 // otherwise endpoint is considered already connected 391 if (fState == LISTEN) { 392 // this socket is about to connect; remove pending connections in the backlog 393 gSocketModule->set_max_backlog(socket, 0); 394 } else if (fState != CLOSED) 395 return EISCONN; 396 397 status_t status = _PrepareSendPath(address); 398 if (status < B_OK) 399 return status; 400 401 TRACE(" Connect(): starting 3-way handshake..."); 402 403 fState = SYNCHRONIZE_SENT; 404 405 // send SYN 406 status = _SendQueued(); 407 if (status != B_OK) { 408 fState = CLOSED; 409 return status; 410 } 411 412 // If we are running over Loopback, after _SendQueued() returns we 413 // may be in ESTABLISHED already. 414 if (fState == ESTABLISHED) { 415 TRACE(" Connect() completed after _SendQueued()"); 416 return B_OK; 417 } 418 419 // wait until 3-way handshake is complete (if needed) 420 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 421 if (timeout == 0) { 422 // we're a non-blocking socket 423 TRACE(" Connect() delayed, return EINPROGRESS"); 424 return EINPROGRESS; 425 } 426 427 status = _WaitForEstablished(locker, absolute_timeout(timeout)); 428 TRACE(" Connect(): Connection complete: %s (timeout was %llu)", 429 strerror(status), timeout); 430 return posix_error(status); 431 } 432 433 434 status_t 435 TCPEndpoint::Accept(struct net_socket **_acceptedSocket) 436 { 437 TRACE("Accept()"); 438 439 RecursiveLocker locker(fLock); 440 441 status_t status; 442 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 443 444 do { 445 locker.Unlock(); 446 447 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 448 | B_CAN_INTERRUPT, timeout); 449 if (status < B_OK) 450 return status; 451 452 locker.Lock(); 453 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 454 if (status == B_OK) 455 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 456 } while (status < B_OK); 457 458 return status; 459 } 460 461 462 status_t 463 TCPEndpoint::Bind(const sockaddr *address) 464 { 465 if (address == NULL) 466 return B_BAD_VALUE; 467 468 RecursiveLocker lock(fLock); 469 470 TRACE("Bind() on address %s", PrintAddress(address)); 471 472 if (fState != CLOSED) 473 return EISCONN; 474 475 return fManager->Bind(this, address); 476 } 477 478 479 status_t 480 TCPEndpoint::Unbind(struct sockaddr *address) 481 { 482 TRACE("Unbind()"); 483 484 RecursiveLocker lock(fLock); 485 return fManager->Unbind(this); 486 } 487 488 489 status_t 490 TCPEndpoint::Listen(int count) 491 { 492 TRACE("Listen()"); 493 494 RecursiveLocker lock(fLock); 495 496 if (fState != CLOSED) 497 return B_BAD_VALUE; 498 499 fAcceptSemaphore = create_sem(0, "tcp accept"); 500 if (fAcceptSemaphore < B_OK) 501 return ENOBUFS; 502 503 status_t status = fManager->SetPassive(this); 504 if (status < B_OK) { 505 delete_sem(fAcceptSemaphore); 506 fAcceptSemaphore = -1; 507 return status; 508 } 509 510 fState = LISTEN; 511 return B_OK; 512 } 513 514 515 status_t 516 TCPEndpoint::Shutdown(int direction) 517 { 518 TRACE("Shutdown(%i)", direction); 519 520 RecursiveLocker lock(fLock); 521 522 if (direction == SHUT_RD || direction == SHUT_RDWR) 523 fFlags |= FLAG_NO_RECEIVE; 524 525 if (direction == SHUT_WR || direction == SHUT_RDWR) 526 _ShutdownEgress(false); 527 528 return B_OK; 529 } 530 531 532 /*! 533 Puts data contained in \a buffer into send buffer 534 */ 535 status_t 536 TCPEndpoint::SendData(net_buffer *buffer) 537 { 538 RecursiveLocker lock(fLock); 539 540 TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]", 541 buffer, buffer->size, buffer->flags, fSendQueue.Size(), 542 fSendQueue.Free()); 543 544 if (fState == CLOSED) 545 return ENOTCONN; 546 else if (fState == LISTEN) { 547 return EDESTADDRREQ; 548 } else if (fState == FINISH_SENT || fState == FINISH_ACKNOWLEDGED 549 || fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 550 || fState == TIME_WAIT) { 551 // TODO: send SIGPIPE signal to app? 552 return EPIPE; 553 } 554 555 if (buffer->size > 0) { 556 if (buffer->size > fSendQueue.Size()) 557 return EMSGSIZE; 558 559 bigtime_t timeout = absolute_timeout(socket->send.timeout); 560 561 while (fSendQueue.Free() < buffer->size) { 562 status_t status = fSendList.Wait(lock, timeout); 563 if (status < B_OK) { 564 TRACE(" SendData() returning %s (%d)", 565 strerror(posix_error(status)), (int)posix_error(status)); 566 return posix_error(status); 567 } 568 } 569 570 fSendQueue.Add(buffer); 571 } 572 573 TRACE(" SendData(): %lu bytes used.", fSendQueue.Used()); 574 575 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 576 _SendQueued(); 577 578 return B_OK; 579 } 580 581 582 ssize_t 583 TCPEndpoint::SendAvailable() 584 { 585 RecursiveLocker locker(fLock); 586 587 ssize_t available; 588 589 if (is_writable(fState)) 590 available = fSendQueue.Free(); 591 else 592 available = EPIPE; 593 594 TRACE("SendAvailable(): %li", available); 595 return available; 596 } 597 598 599 status_t 600 TCPEndpoint::FillStat(net_stat *stat) 601 { 602 RecursiveLocker _(fLock); 603 604 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 605 stat->receive_queue_size = fReceiveQueue.Available(); 606 stat->send_queue_size = fSendQueue.Used(); 607 608 return B_OK; 609 } 610 611 612 status_t 613 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 614 { 615 TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags); 616 617 RecursiveLocker locker(fLock); 618 619 *_buffer = NULL; 620 621 if (fState == CLOSED) 622 return ENOTCONN; 623 624 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 625 626 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 627 if (flags & MSG_DONTWAIT) 628 return B_WOULD_BLOCK; 629 630 status_t status = _WaitForEstablished(locker, timeout); 631 if (status < B_OK) 632 return posix_error(status); 633 } 634 635 size_t dataNeeded = socket->receive.low_water_mark; 636 637 // When MSG_WAITALL is set then the function should block 638 // until the full amount of data can be returned. 639 if (flags & MSG_WAITALL) 640 dataNeeded = numBytes; 641 642 // TODO: add support for urgent data (MSG_OOB) 643 644 while (true) { 645 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 646 || fState == TIME_WAIT) { 647 // ``Connection closing''. 648 return B_OK; 649 } 650 651 if (fReceiveQueue.Available() >= dataNeeded || 652 ((fReceiveQueue.PushedData() > 0) 653 && (fReceiveQueue.PushedData() >= fReceiveQueue.Available()))) 654 break; 655 656 if (fState == FINISH_RECEIVED) { 657 // ``If no text is awaiting delivery, the RECEIVE will 658 // get a Connection closing''. 659 return B_OK; 660 } 661 662 if (flags & MSG_DONTWAIT) 663 return B_WOULD_BLOCK; 664 665 status_t status = fReceiveList.Wait(locker, timeout, false); 666 if (status < B_OK) { 667 // The Open Group base specification mentions that EINTR should be 668 // returned if the recv() is interrupted before _any data_ is 669 // available. So we actually check if there is data, and if so, 670 // push it to the user. 671 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 672 && fReceiveQueue.Available() > 0) 673 break; 674 675 return posix_error(status); 676 } 677 } 678 679 TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available()); 680 681 if (numBytes < fReceiveQueue.Available()) 682 fReceiveList.Signal(); 683 684 bool clone = (flags & MSG_PEEK); 685 686 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 687 688 TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available()); 689 690 // if we are opening the window, check if we should send an ACK 691 if (!clone) 692 SendAcknowledge(false); 693 694 return receivedBytes; 695 } 696 697 698 ssize_t 699 TCPEndpoint::ReadAvailable() 700 { 701 RecursiveLocker locker(fLock); 702 703 TRACE("ReadAvailable(): %li", _AvailableData()); 704 705 return _AvailableData(); 706 } 707 708 709 status_t 710 TCPEndpoint::SetSendBufferSize(size_t length) 711 { 712 RecursiveLocker _(fLock); 713 fSendQueue.SetMaxBytes(length); 714 return B_OK; 715 } 716 717 718 status_t 719 TCPEndpoint::SetReceiveBufferSize(size_t length) 720 { 721 RecursiveLocker _(fLock); 722 fReceiveQueue.SetMaxBytes(length); 723 return B_OK; 724 } 725 726 727 // #pragma mark - misc 728 729 730 bool 731 TCPEndpoint::IsBound() const 732 { 733 return !LocalAddress().IsEmpty(true); 734 } 735 736 737 void 738 TCPEndpoint::DeleteSocket() 739 { 740 // the next call will delete `this'. 741 gSocketModule->delete_socket(socket); 742 } 743 744 745 status_t 746 TCPEndpoint::DelayedAcknowledge() 747 { 748 // if the timer is already running, and there is still more than 749 // half of the receive window free, just wait for the timer to expire 750 if (gStackModule->is_timer_active(&fDelayedAcknowledgeTimer) 751 && (fReceiveMaxAdvertised - fReceiveNext) > (socket->receive.buffer_size >> 1)) 752 return B_OK; 753 754 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 755 // timer was active, send an ACK now (with the exception above, 756 // we send every other ACK) 757 return SendAcknowledge(true); 758 } 759 760 gStackModule->set_timer(&fDelayedAcknowledgeTimer, TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 761 return B_OK; 762 } 763 764 765 status_t 766 TCPEndpoint::SendAcknowledge(bool force) 767 { 768 return _SendQueued(force, 0); 769 } 770 771 772 void 773 TCPEndpoint::_StartPersistTimer() 774 { 775 gStackModule->set_timer(&fPersistTimer, 1000000LL); 776 } 777 778 779 void 780 TCPEndpoint::_EnterTimeWait() 781 { 782 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 783 } 784 785 786 status_t 787 TCPEndpoint::UpdateTimeWait() 788 { 789 return B_OK; 790 } 791 792 793 // #pragma mark - receive 794 795 796 int32 797 TCPEndpoint::_ListenReceive(tcp_segment_header &segment, net_buffer *buffer) 798 { 799 TRACE("ListenReceive()"); 800 801 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 802 // but the error behaviour differs 803 if (segment.flags & TCP_FLAG_RESET) 804 return DROP; 805 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 806 return DROP | RESET; 807 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 808 return DROP; 809 810 // TODO: drop broadcast/multicast 811 812 // spawn new endpoint for accept() 813 net_socket *newSocket; 814 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) 815 return DROP; 816 817 return ((TCPEndpoint *)newSocket->first_protocol)->Spawn(this, 818 segment, buffer); 819 } 820 821 822 int32 823 TCPEndpoint::Spawn(TCPEndpoint *parent, tcp_segment_header &segment, 824 net_buffer *buffer) 825 { 826 RecursiveLocker _(fLock); 827 828 // TODO error checking 829 ProtocolSocket::Open(); 830 831 fState = SYNCHRONIZE_RECEIVED; 832 fManager = parent->fManager; 833 834 LocalAddress().SetTo(&buffer->destination); 835 PeerAddress().SetTo(&buffer->source); 836 837 TRACE("Spawn()"); 838 839 // TODO: proper error handling! 840 if (fManager->BindChild(this) < B_OK) 841 return DROP; 842 843 if (_PrepareSendPath(*PeerAddress()) < B_OK) 844 return DROP; 845 846 fOptions = parent->fOptions; 847 fAcceptSemaphore = parent->fAcceptSemaphore; 848 849 _PrepareReceivePath(segment); 850 851 // send SYN+ACK 852 if (_SendQueued() < B_OK) 853 return DROP; 854 855 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 856 // we handled this flag now, it must not be set for further processing 857 858 return _Receive(segment, buffer); 859 } 860 861 862 void 863 TCPEndpoint::DumpInternalState() const 864 { 865 kprintf("Lock: { sem: %ld, holder: %ld, recursion: %i }\n", 866 fLock.sem, fLock.holder, fLock.recursion); 867 kprintf("AcceptSem: %ld\n", fAcceptSemaphore); 868 kprintf("Options: 0x%lx\n", (uint32)fOptions); 869 kprintf("SendWindowShift: %lu\n", (uint32)fSendWindowShift); 870 kprintf("ReceiveWindowShift: %lu\n", (uint32)fReceiveWindowShift); 871 kprintf("SendUnacknowledged: %lu\n", (uint32)fSendUnacknowledged); 872 kprintf("SendNext: %lu\n", (uint32)fSendNext); 873 kprintf("SendMax: %lu\n", (uint32)fSendMax); 874 kprintf("SendWindow: %lu\n", fSendWindow); 875 kprintf("SendMaxWindow: %lu\n", fSendMaxWindow); 876 kprintf("SendMaxSegmentSize: %lu\n", fSendMaxSegmentSize); 877 kprintf("Send-Q: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size()); 878 kprintf("LastAcknowledgeSent: %lu\n", (uint32)fLastAcknowledgeSent); 879 kprintf("InitialSendSequence: %lu\n", (uint32)fInitialSendSequence); 880 kprintf("DuplicateAcknowledgeCount: %lu\n", fDuplicateAcknowledgeCount); 881 kprintf("ReceiveNext: %lu\n", (uint32)fReceiveNext); 882 kprintf("ReceiveMaxAdvertised: %lu\n", (uint32)fReceiveMaxAdvertised); 883 kprintf("ReceiveWindow: %lu\n", (uint32)fReceiveWindow); 884 kprintf("ReceiveMaxSegmentSize: %lu\n", (uint32)fReceiveMaxSegmentSize); 885 kprintf("Recv-Q: %lu / %lu\n", fReceiveQueue.Available(), 886 fReceiveQueue.Size()); 887 kprintf("InitialReceiveSequence: %lu\n", (uint32)fInitialReceiveSequence); 888 kprintf("RoundTripTime: %ld (dev %ld)\n", fRoundTripTime, 889 fRoundTripDeviation); 890 kprintf("RetransmitTimeout: %llu\n", (uint64)fRetransmitTimeout); 891 kprintf("CongestionWindow: %lu\n", fCongestionWindow); 892 kprintf("SlowStartThreshold: %lu\n", fSlowStartThreshold); 893 kprintf("State: %s\n", name_for_state(fState)); 894 kprintf("Flags: 0x%lx\n", fFlags); 895 } 896 897 898 int32 899 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, net_buffer *buffer) 900 { 901 TRACE("SynchronizeSentReceive()"); 902 903 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 904 && (fInitialSendSequence >= segment.acknowledge 905 || fSendMax < segment.acknowledge)) 906 return DROP | RESET; 907 908 if (segment.flags & TCP_FLAG_RESET) { 909 fError = ECONNREFUSED; 910 fState = CLOSED; 911 return DROP; 912 } 913 914 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 915 return DROP; 916 917 fSendUnacknowledged = segment.acknowledge; 918 _PrepareReceivePath(segment); 919 920 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 921 _MarkEstablished(); 922 } else { 923 // simultaneous open 924 fState = SYNCHRONIZE_RECEIVED; 925 } 926 927 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 928 // we handled this flag now, it must not be set for further processing 929 930 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 931 } 932 933 934 int32 935 TCPEndpoint::SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 936 { 937 RecursiveLocker locker(fLock); 938 939 TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s", 940 buffer, buffer->size, PrintAddress(&buffer->source), 941 PrintAddress(&buffer->destination)); 942 TRACE(" flags 0x%x, seq %lu, ack %lu, wnd %lu", 943 segment.flags, segment.sequence, segment.acknowledge, 944 (uint32)segment.advertised_window << fSendWindowShift); 945 946 int32 segmentAction = DROP; 947 948 switch (fState) { 949 case LISTEN: 950 segmentAction = _ListenReceive(segment, buffer); 951 break; 952 953 case SYNCHRONIZE_SENT: 954 segmentAction = _SynchronizeSentReceive(segment, buffer); 955 break; 956 957 case SYNCHRONIZE_RECEIVED: 958 case ESTABLISHED: 959 case FINISH_RECEIVED: 960 case WAIT_FOR_FINISH_ACKNOWLEDGE: 961 case FINISH_SENT: 962 case FINISH_ACKNOWLEDGED: 963 case CLOSING: 964 case TIME_WAIT: 965 case CLOSED: 966 segmentAction = _SegmentReceived(segment, buffer); 967 break; 968 } 969 970 // process acknowledge action as asked for by the *Receive() method 971 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 972 SendAcknowledge(true); 973 else if (segmentAction & ACKNOWLEDGE) 974 DelayedAcknowledge(); 975 976 return segmentAction; 977 } 978 979 int32 980 TCPEndpoint::_SegmentReceived(tcp_segment_header &segment, net_buffer *buffer) 981 { 982 uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift; 983 984 // First, handle the most common case for uni-directional data transfer 985 // (known as header prediction - the segment must not change the window, 986 // and must be the expected sequence, and contain no control flags) 987 988 if (fState == ESTABLISHED 989 && segment.AcknowledgeOnly() 990 && fReceiveNext == segment.sequence 991 && advertisedWindow > 0 && advertisedWindow == fSendWindow 992 && fSendNext == fSendMax) { 993 994 _UpdateTimestamps(segment, buffer->size); 995 996 if (buffer->size == 0) { 997 // this is a pure acknowledge segment - we're on the sending end 998 if (fSendUnacknowledged < segment.acknowledge 999 && fSendMax >= segment.acknowledge) { 1000 _Acknowledged(segment); 1001 return DROP; 1002 } 1003 } else if (segment.acknowledge == fSendUnacknowledged 1004 && fReceiveQueue.IsContiguous() 1005 && fReceiveQueue.Free() >= buffer->size 1006 && !(fFlags & FLAG_NO_RECEIVE)) { 1007 _AddData(segment, buffer); 1008 _NotifyReader(); 1009 return KEEP | ((segment.flags & TCP_FLAG_PUSH) ? 1010 IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1011 } 1012 } 1013 1014 // The fast path was not applicable, so we continue with the standard 1015 // processing of the incoming segment 1016 1017 if (fState != SYNCHRONIZE_SENT && fState != LISTEN && fState != CLOSED) { 1018 // 1. check sequence number 1019 if (!segment_in_sequence(segment, buffer->size, fReceiveNext, 1020 fReceiveWindow)) { 1021 TRACE(" Receive(): segment out of window, next: %lu wnd: %lu", 1022 (uint32)fReceiveNext, fReceiveWindow); 1023 if (segment.flags & TCP_FLAG_RESET) 1024 return DROP; 1025 return DROP | IMMEDIATE_ACKNOWLEDGE; 1026 } 1027 } 1028 1029 return _Receive(segment, buffer); 1030 } 1031 1032 1033 // #pragma mark - send 1034 1035 1036 inline uint8 1037 TCPEndpoint::_CurrentFlags() 1038 { 1039 // we don't set FLAG_FINISH here, instead we do it 1040 // conditionally below depending if we are sending 1041 // the last bytes of the send queue. 1042 1043 switch (fState) { 1044 case CLOSED: 1045 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1046 1047 case SYNCHRONIZE_SENT: 1048 return TCP_FLAG_SYNCHRONIZE; 1049 case SYNCHRONIZE_RECEIVED: 1050 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1051 1052 case ESTABLISHED: 1053 case FINISH_RECEIVED: 1054 case FINISH_ACKNOWLEDGED: 1055 case TIME_WAIT: 1056 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1057 case FINISH_SENT: 1058 case CLOSING: 1059 return TCP_FLAG_ACKNOWLEDGE; 1060 1061 default: 1062 return B_ERROR; 1063 } 1064 } 1065 1066 1067 inline bool 1068 TCPEndpoint::_ShouldSendSegment(tcp_segment_header &segment, uint32 length, 1069 uint32 segmentMaxSize, uint32 flightSize) 1070 { 1071 if (length > 0) { 1072 // Avoid the silly window syndrome - we only send a segment in case: 1073 // - we have a full segment to send, or 1074 // - we're at the end of our buffer queue, or 1075 // - the buffer is at least larger than half of the maximum send window, or 1076 // - we're retransmitting data 1077 if (length == segmentMaxSize 1078 || (fOptions & TCP_NODELAY) != 0 1079 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1080 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1081 return true; 1082 } 1083 1084 // check if we need to send a window update to the peer 1085 if (segment.advertised_window > 0) { 1086 // correct the window to take into account what already has been advertised 1087 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1088 - (fReceiveMaxAdvertised - fReceiveNext); 1089 1090 // if we can advertise a window larger than twice the maximum segment 1091 // size, or half the maximum buffer size we send a window update 1092 if (window >= (fReceiveMaxSegmentSize << 1) 1093 || window >= (socket->receive.buffer_size >> 1)) 1094 return true; 1095 } 1096 1097 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH | TCP_FLAG_RESET)) != 0) 1098 return true; 1099 1100 // there is no reason to send a segment just now 1101 return false; 1102 } 1103 1104 1105 status_t 1106 TCPEndpoint::_SendQueued(bool force) 1107 { 1108 return _SendQueued(force, fSendWindow); 1109 } 1110 1111 1112 /*! 1113 Sends one or more TCP segments with the data waiting in the queue, or some 1114 specific flags that need to be sent. 1115 */ 1116 status_t 1117 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1118 { 1119 if (fRoute == NULL) 1120 return B_ERROR; 1121 1122 // in passive state? 1123 if (fState == LISTEN) 1124 return B_ERROR; 1125 1126 tcp_segment_header segment(_CurrentFlags()); 1127 1128 if ((fOptions & TCP_NOOPT) == 0) { 1129 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1130 segment.options |= TCP_HAS_TIMESTAMPS; 1131 segment.timestamp_reply = fReceivedTimestamp; 1132 segment.timestamp_value = tcp_now(); 1133 } 1134 1135 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) 1136 && (fSendNext == fInitialSendSequence)) { 1137 // add connection establishment options 1138 segment.max_segment_size = fReceiveMaxSegmentSize; 1139 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1140 segment.options |= TCP_HAS_WINDOW_SCALE; 1141 segment.window_shift = fReceiveWindowShift; 1142 } 1143 } 1144 } 1145 1146 size_t availableBytes = fReceiveQueue.Free(); 1147 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1148 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1149 else 1150 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1151 1152 segment.acknowledge = fReceiveNext; 1153 segment.urgent_offset = 0; 1154 1155 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1156 sendWindow = fCongestionWindow; 1157 1158 // SND.UNA SND.NXT SND.MAX 1159 // | | | 1160 // v v v 1161 // ----------------------------------- 1162 // | effective window | 1163 // ----------------------------------- 1164 1165 // Flight size represents the window of data which is currently in the 1166 // ether. We should never send data such as the flight size becomes larger 1167 // than the effective window. Note however that the effective window may be 1168 // reduced (by congestion for instance), so at some point in time flight 1169 // size may be larger than the currently calculated window. 1170 1171 uint32 flightSize = fSendMax - fSendUnacknowledged; 1172 uint32 consumedWindow = fSendNext - fSendUnacknowledged; 1173 1174 if (consumedWindow > sendWindow) { 1175 sendWindow = 0; 1176 // TODO enter persist state? try to get a window update. 1177 } else 1178 sendWindow -= consumedWindow; 1179 1180 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 1181 // send one byte of data to ask for a window update 1182 // (triggered by the persist timer) 1183 sendWindow = 1; 1184 } 1185 1186 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 1187 tcp_sequence previousSendNext = fSendNext; 1188 1189 do { 1190 uint32 segmentMaxSize = fSendMaxSegmentSize 1191 - tcp_options_length(segment); 1192 uint32 segmentLength = min_c(length, segmentMaxSize); 1193 1194 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 1195 if (state_needs_finish(fState)) 1196 segment.flags |= TCP_FLAG_FINISH; 1197 if (length > 0) 1198 segment.flags |= TCP_FLAG_PUSH; 1199 } 1200 1201 // Determine if we should really send this segment 1202 if (!force && !_ShouldSendSegment(segment, segmentLength, 1203 segmentMaxSize, flightSize)) { 1204 if (fSendQueue.Available() 1205 && !gStackModule->is_timer_active(&fPersistTimer) 1206 && !gStackModule->is_timer_active(&fRetransmitTimer)) 1207 _StartPersistTimer(); 1208 break; 1209 } 1210 1211 net_buffer *buffer = gBufferModule->create(256); 1212 if (buffer == NULL) 1213 return B_NO_MEMORY; 1214 1215 status_t status = B_OK; 1216 if (segmentLength > 0) 1217 fSendQueue.Get(buffer, fSendNext, segmentLength); 1218 if (status < B_OK) { 1219 gBufferModule->free(buffer); 1220 return status; 1221 } 1222 1223 LocalAddress().CopyTo(&buffer->source); 1224 PeerAddress().CopyTo(&buffer->destination); 1225 1226 uint32 size = buffer->size; 1227 segment.sequence = fSendNext; 1228 1229 TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s", 1230 buffer, buffer->size, PrintAddress(&buffer->source), 1231 PrintAddress(&buffer->destination)); 1232 TRACE(" flags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu" 1233 ", ssthresh %lu", segment.flags, segment.sequence, 1234 segment.acknowledge, segment.advertised_window, 1235 fCongestionWindow, fSlowStartThreshold); 1236 TRACE(" len %lu first %lu last %lu", segmentLength, 1237 (uint32)fSendQueue.FirstSequence(), 1238 (uint32)fSendQueue.LastSequence()); 1239 1240 PROBE(buffer, sendWindow); 1241 sendWindow -= buffer->size; 1242 1243 status = add_tcp_header(AddressModule(), segment, buffer); 1244 if (status != B_OK) { 1245 gBufferModule->free(buffer); 1246 return status; 1247 } 1248 1249 // Update send status - we need to do this before we send the data 1250 // for local connections as the answer is directly handled 1251 1252 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 1253 segment.options &= ~TCP_HAS_WINDOW_SCALE; 1254 segment.max_segment_size = 0; 1255 size++; 1256 } 1257 1258 if (segment.flags & TCP_FLAG_FINISH) 1259 size++; 1260 1261 uint32 sendMax = fSendMax; 1262 fSendNext += size; 1263 if (fSendMax < fSendNext) 1264 fSendMax = fSendNext; 1265 1266 fReceiveMaxAdvertised = fReceiveNext 1267 + ((uint32)segment.advertised_window << fReceiveWindowShift); 1268 1269 status = next->module->send_routed_data(next, fRoute, buffer); 1270 if (status < B_OK) { 1271 gBufferModule->free(buffer); 1272 1273 fSendNext = segment.sequence; 1274 fSendMax = sendMax; 1275 // restore send status 1276 return status; 1277 } 1278 1279 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1280 fLastAcknowledgeSent = segment.acknowledge; 1281 1282 length -= segmentLength; 1283 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET | TCP_FLAG_FINISH); 1284 } while (length > 0); 1285 1286 // if we sent data from the beggining of the send queue, 1287 // start the retransmition timer 1288 if (previousSendNext == fSendUnacknowledged 1289 && fSendNext > previousSendNext) { 1290 TRACE(" SendQueue(): set retransmit timer with rto %llu", 1291 fRetransmitTimeout); 1292 1293 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 1294 } 1295 1296 return B_OK; 1297 } 1298 1299 1300 int 1301 TCPEndpoint::_GetMSS(const sockaddr *address) const 1302 { 1303 return next->module->get_mtu(next, (sockaddr *)address) - sizeof(tcp_header); 1304 } 1305 1306 1307 status_t 1308 TCPEndpoint::_ShutdownEgress(bool closing) 1309 { 1310 tcp_state previousState = fState; 1311 1312 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1313 fState = FINISH_SENT; 1314 else if (fState == FINISH_RECEIVED) 1315 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1316 else 1317 return B_OK; 1318 1319 status_t status = _SendQueued(); 1320 if (status != B_OK) { 1321 fState = previousState; 1322 return status; 1323 } 1324 1325 return B_OK; 1326 } 1327 1328 1329 ssize_t 1330 TCPEndpoint::_AvailableData() const 1331 { 1332 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1333 // the application of FLAG_NO_RECEIVE in listen()ing 1334 // sockets. 1335 if (fState == LISTEN) 1336 return gSocketModule->count_connected(socket); 1337 else if (fState == SYNCHRONIZE_SENT) 1338 return 0; 1339 1340 ssize_t availableData = fReceiveQueue.Available(); 1341 1342 if (availableData == 0 && !_ShouldReceive()) 1343 return ENOTCONN; 1344 1345 return availableData; 1346 } 1347 1348 1349 void 1350 TCPEndpoint::_NotifyReader() 1351 { 1352 fReceiveList.Signal(); 1353 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1354 } 1355 1356 1357 bool 1358 TCPEndpoint::_ShouldReceive() const 1359 { 1360 if (fFlags & FLAG_NO_RECEIVE) 1361 return false; 1362 1363 return fState == ESTABLISHED || fState == FINISH_SENT 1364 || fState == FINISH_ACKNOWLEDGED; 1365 } 1366 1367 1368 int32 1369 TCPEndpoint::_Receive(tcp_segment_header &segment, net_buffer *buffer) 1370 { 1371 uint32 advertisedWindow = (uint32)segment.advertised_window << fSendWindowShift; 1372 1373 size_t segmentLength = buffer->size; 1374 1375 if (segment.flags & TCP_FLAG_RESET) { 1376 // is this a valid reset? 1377 if (fLastAcknowledgeSent <= segment.sequence 1378 && tcp_sequence(segment.sequence) 1379 < (fLastAcknowledgeSent + fReceiveWindow)) { 1380 if (fState == SYNCHRONIZE_RECEIVED) 1381 fError = ECONNREFUSED; 1382 else if (fState == CLOSING || fState == TIME_WAIT 1383 || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1384 fError = ENOTCONN; 1385 else 1386 fError = ECONNRESET; 1387 1388 _NotifyReader(); 1389 fState = CLOSED; 1390 } 1391 1392 return DROP; 1393 } 1394 1395 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1396 || (fState == SYNCHRONIZE_RECEIVED 1397 && (fInitialReceiveSequence > segment.sequence 1398 || (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1399 && (fSendUnacknowledged > segment.acknowledge 1400 || fSendMax < segment.acknowledge)))) { 1401 // reset the connection - either the initial SYN was faulty, or we 1402 // received a SYN within the data stream 1403 return DROP | RESET; 1404 } 1405 1406 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1407 // the window must not shrink 1408 1409 // trim buffer to be within the receive window 1410 int32 drop = fReceiveNext - segment.sequence; 1411 if (drop > 0) { 1412 if ((uint32)drop > buffer->size 1413 || ((uint32)drop == buffer->size 1414 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1415 // don't accidently remove a FIN we shouldn't remove 1416 segment.flags &= ~TCP_FLAG_FINISH; 1417 drop = buffer->size; 1418 } 1419 1420 // remove duplicate data at the start 1421 TRACE("* remove %ld bytes from the start", drop); 1422 gBufferModule->remove_header(buffer, drop); 1423 segment.sequence += drop; 1424 } 1425 1426 int32 action = KEEP; 1427 1428 drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow); 1429 if (drop > 0) { 1430 // remove data exceeding our window 1431 if ((uint32)drop >= buffer->size) { 1432 // if we can accept data, or the segment is not what we'd expect, 1433 // drop the segment (an immediate acknowledge is always triggered) 1434 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1435 return DROP | IMMEDIATE_ACKNOWLEDGE; 1436 1437 action |= IMMEDIATE_ACKNOWLEDGE; 1438 } 1439 1440 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1441 // we need to remove the finish, too, as part of the data 1442 drop--; 1443 } 1444 1445 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1446 TRACE("* remove %ld bytes from the end", drop); 1447 gBufferModule->remove_trailer(buffer, drop); 1448 } 1449 1450 if (advertisedWindow > fSendWindow) 1451 TRACE(" Receive(): Window update %lu -> %lu", fSendWindow, 1452 advertisedWindow); 1453 1454 fSendWindow = advertisedWindow; 1455 if (advertisedWindow > fSendMaxWindow) 1456 fSendMaxWindow = advertisedWindow; 1457 1458 // Then look at the acknowledgement for any updates 1459 1460 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1461 // process acknowledged data 1462 if (fState == SYNCHRONIZE_RECEIVED) 1463 _MarkEstablished(); 1464 1465 if (fSendMax < segment.acknowledge || fState == TIME_WAIT) 1466 return DROP | IMMEDIATE_ACKNOWLEDGE; 1467 1468 if (segment.acknowledge < fSendUnacknowledged) { 1469 if (buffer->size == 0 && advertisedWindow == fSendWindow 1470 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1471 TRACE("Receive(): duplicate ack!"); 1472 1473 _DuplicateAcknowledge(segment); 1474 } 1475 1476 return DROP; 1477 } else { 1478 // this segment acknowledges in flight data 1479 1480 if (fDuplicateAcknowledgeCount >= 3) { 1481 // deflate the window. 1482 fCongestionWindow = fSlowStartThreshold; 1483 } 1484 1485 fDuplicateAcknowledgeCount = 0; 1486 1487 if (fSendMax == segment.acknowledge) 1488 TRACE("Receive(): all inflight data ack'd!"); 1489 1490 if (segment.acknowledge > fSendQueue.LastSequence() 1491 && fState > ESTABLISHED) { 1492 TRACE("Receive(): FIN has been acknowledged!"); 1493 1494 switch (fState) { 1495 case FINISH_SENT: 1496 fState = FINISH_ACKNOWLEDGED; 1497 break; 1498 case CLOSING: 1499 fState = TIME_WAIT; 1500 _EnterTimeWait(); 1501 return DROP; 1502 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1503 fState = CLOSED; 1504 break; 1505 1506 default: 1507 break; 1508 } 1509 } 1510 1511 if (fState != CLOSED) 1512 _Acknowledged(segment); 1513 } 1514 } 1515 1516 if (segment.flags & TCP_FLAG_URGENT) { 1517 if (fState == ESTABLISHED || fState == FINISH_SENT 1518 || fState == FINISH_ACKNOWLEDGED) { 1519 // TODO: Handle urgent data: 1520 // - RCV.UP <- max(RCV.UP, SEG.UP) 1521 // - signal the user that urgent data is available (SIGURG) 1522 } 1523 } 1524 1525 bool notify = false; 1526 1527 if (buffer->size > 0 && _ShouldReceive()) { 1528 _AddData(segment, buffer); 1529 notify = true; 1530 } else 1531 action = (action & ~KEEP) | DROP; 1532 1533 if (segment.flags & TCP_FLAG_FINISH) { 1534 segmentLength++; 1535 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1536 TRACE("Receive(): peer is finishing connection!"); 1537 fReceiveNext++; 1538 notify = true; 1539 1540 // FIN implies PSH 1541 fReceiveQueue.SetPushPointer(); 1542 1543 // we'll reply immediatly to the FIN if we are not 1544 // transitioning to TIME WAIT so we immediatly ACK it. 1545 action |= IMMEDIATE_ACKNOWLEDGE; 1546 1547 // other side is closing connection; change states 1548 switch (fState) { 1549 case ESTABLISHED: 1550 case SYNCHRONIZE_RECEIVED: 1551 fState = FINISH_RECEIVED; 1552 break; 1553 case FINISH_SENT: 1554 // simultaneous close 1555 fState = CLOSING; 1556 break; 1557 case FINISH_ACKNOWLEDGED: 1558 fState = TIME_WAIT; 1559 _EnterTimeWait(); 1560 break; 1561 default: 1562 break; 1563 } 1564 } 1565 } 1566 1567 if (notify) 1568 _NotifyReader(); 1569 1570 if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1571 action |= ACKNOWLEDGE; 1572 1573 _UpdateTimestamps(segment, segmentLength); 1574 1575 TRACE("Receive() Action %ld", action); 1576 1577 return action; 1578 } 1579 1580 1581 void 1582 TCPEndpoint::_UpdateTimestamps(tcp_segment_header &segment, 1583 size_t segmentLength) 1584 { 1585 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1586 tcp_sequence sequence(segment.sequence); 1587 1588 if ((fLastAcknowledgeSent >= sequence 1589 && fLastAcknowledgeSent < (sequence + segmentLength))) 1590 fReceivedTimestamp = segment.timestamp_value; 1591 } 1592 } 1593 1594 1595 void 1596 TCPEndpoint::_MarkEstablished() 1597 { 1598 fState = ESTABLISHED; 1599 1600 if (socket->parent != NULL) { 1601 gSocketModule->set_connected(socket); 1602 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1603 } 1604 1605 fSendList.Signal(); 1606 } 1607 1608 1609 status_t 1610 TCPEndpoint::_WaitForEstablished(RecursiveLocker &locker, bigtime_t timeout) 1611 { 1612 while (fState != ESTABLISHED) { 1613 status_t status = fSendList.Wait(locker, timeout); 1614 if (status < B_OK) 1615 return status; 1616 } 1617 1618 return B_OK; 1619 } 1620 1621 1622 void 1623 TCPEndpoint::_AddData(tcp_segment_header &segment, net_buffer *buffer) 1624 { 1625 fReceiveQueue.Add(buffer, segment.sequence); 1626 fReceiveNext = fReceiveQueue.NextSequence(); 1627 1628 TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu bytes.", 1629 (uint32)fReceiveNext, fReceiveQueue.Available()); 1630 1631 if (segment.flags & TCP_FLAG_PUSH) 1632 fReceiveQueue.SetPushPointer(); 1633 } 1634 1635 1636 void 1637 TCPEndpoint::_PrepareReceivePath(tcp_segment_header &segment) 1638 { 1639 fInitialReceiveSequence = segment.sequence; 1640 1641 // count the received SYN 1642 segment.sequence++; 1643 1644 fReceiveNext = segment.sequence; 1645 fReceiveQueue.SetInitialSequence(segment.sequence); 1646 1647 if ((fOptions & TCP_NOOPT) == 0) { 1648 if (segment.max_segment_size > 0) 1649 fSendMaxSegmentSize = segment.max_segment_size; 1650 1651 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1652 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1653 fSendWindowShift = segment.window_shift; 1654 } else { 1655 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1656 fReceiveWindowShift = 0; 1657 } 1658 1659 if (segment.options & TCP_HAS_TIMESTAMPS) { 1660 fFlags |= FLAG_OPTION_TIMESTAMP; 1661 fReceivedTimestamp = segment.timestamp_value; 1662 } else 1663 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1664 } 1665 1666 fCongestionWindow = 2 * fSendMaxSegmentSize; 1667 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1668 } 1669 1670 1671 status_t 1672 TCPEndpoint::_PrepareSendPath(const sockaddr *peer) 1673 { 1674 if (fRoute == NULL) { 1675 fRoute = gDatalinkModule->get_route(Domain(), peer); 1676 if (fRoute == NULL) 1677 return ENETUNREACH; 1678 } 1679 1680 // make sure connection does not already exist 1681 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 1682 fRoute->interface->address); 1683 if (status < B_OK) 1684 return status; 1685 1686 fInitialSendSequence = system_time() >> 4; 1687 fSendNext = fInitialSendSequence; 1688 fSendUnacknowledged = fInitialSendSequence; 1689 fSendMax = fInitialSendSequence; 1690 1691 // we are counting the SYN here 1692 fSendQueue.SetInitialSequence(fSendNext + 1); 1693 1694 fReceiveMaxSegmentSize = _GetMSS(peer); 1695 1696 // Compute the window shift we advertise to our peer - if it doesn't support 1697 // this option, this will be reset to 0 (when its SYN is received) 1698 fReceiveWindowShift = 0; 1699 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 1700 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 1701 fReceiveWindowShift++; 1702 } 1703 1704 return B_OK; 1705 } 1706 1707 1708 void 1709 TCPEndpoint::_Acknowledged(tcp_segment_header &segment) 1710 { 1711 size_t previouslyUsed = fSendQueue.Used(); 1712 1713 fSendQueue.RemoveUntil(segment.acknowledge); 1714 fSendUnacknowledged = segment.acknowledge; 1715 1716 if (fSendNext < fSendUnacknowledged) 1717 fSendNext = fSendUnacknowledged; 1718 1719 if (fSendUnacknowledged == fSendMax) 1720 gStackModule->cancel_timer(&fRetransmitTimer); 1721 1722 if (fSendQueue.Used() < previouslyUsed) { 1723 // this ACK acknowledged data 1724 1725 if (segment.options & TCP_HAS_TIMESTAMPS) 1726 _UpdateSRTT(tcp_diff_timestamp(segment.timestamp_reply)); 1727 else { 1728 // TODO Fallback to RFC 793 type estimation 1729 } 1730 1731 if (is_writable(fState)) { 1732 // notify threads waiting on the socket to become writable again 1733 fSendList.Signal(); 1734 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used()); 1735 } 1736 1737 if (fCongestionWindow < fSlowStartThreshold) 1738 fCongestionWindow += fSendMaxSegmentSize; 1739 } 1740 1741 if (fCongestionWindow >= fSlowStartThreshold) { 1742 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 1743 1744 if (increment < fCongestionWindow) 1745 increment = 1; 1746 else 1747 increment /= fCongestionWindow; 1748 1749 fCongestionWindow += increment; 1750 } 1751 1752 // if there is data left to be send, send it now 1753 if (fSendQueue.Used() > 0) 1754 _SendQueued(); 1755 } 1756 1757 1758 void 1759 TCPEndpoint::_Retransmit() 1760 { 1761 TRACE("Retransmit()"); 1762 _ResetSlowStart(); 1763 fSendNext = fSendUnacknowledged; 1764 _SendQueued(); 1765 } 1766 1767 1768 void 1769 TCPEndpoint::_UpdateSRTT(int32 roundTripTime) 1770 { 1771 int32 rtt = roundTripTime; 1772 1773 // Update_SRTT() as per Van Jacobson 1774 rtt -= (fRoundTripTime / 8); 1775 fRoundTripTime += rtt; 1776 if (rtt < 0) 1777 rtt = -rtt; 1778 rtt -= (fRoundTripDeviation / 4); 1779 fRoundTripDeviation += rtt; 1780 1781 fRetransmitTimeout = ((fRoundTripTime / 4 + 1782 fRoundTripDeviation) / 2) * kTimestampFactor; 1783 1784 TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout, 1785 roundTripTime); 1786 } 1787 1788 1789 void 1790 TCPEndpoint::_ResetSlowStart() 1791 { 1792 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2, 1793 2 * fSendMaxSegmentSize); 1794 fCongestionWindow = fSendMaxSegmentSize; 1795 } 1796 1797 1798 void 1799 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1800 { 1801 fDuplicateAcknowledgeCount++; 1802 1803 if (fDuplicateAcknowledgeCount < 3) 1804 return; 1805 else if (fDuplicateAcknowledgeCount == 3) { 1806 _ResetSlowStart(); 1807 fCongestionWindow = fSlowStartThreshold + 3 1808 * fSendMaxSegmentSize; 1809 fSendNext = segment.acknowledge; 1810 } else if (fDuplicateAcknowledgeCount > 3) 1811 fCongestionWindow += fSendMaxSegmentSize; 1812 1813 _SendQueued(); 1814 } 1815 1816 1817 // #pragma mark - timer 1818 1819 1820 /*static*/ void 1821 TCPEndpoint::_RetransmitTimer(net_timer *timer, void *data) 1822 { 1823 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1824 1825 RecursiveLocker locker(endpoint->fLock); 1826 if (!locker.IsLocked()) 1827 return; 1828 1829 endpoint->_Retransmit(); 1830 } 1831 1832 1833 /*static*/ void 1834 TCPEndpoint::_PersistTimer(net_timer *timer, void *data) 1835 { 1836 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1837 1838 RecursiveLocker locker(endpoint->fLock); 1839 if (!locker.IsLocked()) 1840 return; 1841 1842 endpoint->_SendQueued(true); 1843 } 1844 1845 1846 /*static*/ void 1847 TCPEndpoint::_DelayedAcknowledgeTimer(struct net_timer *timer, void *data) 1848 { 1849 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1850 1851 RecursiveLocker locker(endpoint->fLock); 1852 if (!locker.IsLocked()) 1853 return; 1854 1855 endpoint->SendAcknowledge(true); 1856 } 1857 1858 1859 /*static*/ void 1860 TCPEndpoint::_TimeWaitTimer(struct net_timer *timer, void *data) 1861 { 1862 TCPEndpoint *endpoint = (TCPEndpoint *)data; 1863 1864 if (recursive_lock_lock(&endpoint->fLock) < B_OK) 1865 return; 1866 1867 endpoint->DeleteSocket(); 1868 } 1869 1870